diff --git a/common/config_models.py b/common/config_models.py
index fff248e..2892b2c 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -80,7 +80,7 @@ class NetworkConfig(BaseConfigModel):
         ),
     )
     api_servers: Optional[List[Literal["OAI", "Kobold"]]] = Field(
-        default_factory=list,
+        ["OAI"],
         description=(
             'Select API servers to enable (default: ["OAI"]).\n'
             "Possible values: OAI, Kobold."
diff --git a/config_sample.yml b/config_sample.yml
index bd790db..10e812f 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -1,216 +1,216 @@
-# Sample YAML file for configuration.
-# Comment and uncomment values as needed.
-# Every value has a default within the application.
-# This file serves to be a drop in for config.yml
-
-# Unless specified in the comments, DO NOT put these options in quotes!
-# You can use https://www.yamllint.com/ if you want to check your YAML formatting.
-
-# Options for networking
-network:
-  # The IP to host on (default: 127.0.0.1).
-  # Use 0.0.0.0 to expose on all network adapters.
-  host: 127.0.0.1
-
-  # The port to host on (default: 5000).
-  port: 5000
-
-  # Disable HTTP token authentication with requests.
-  # WARNING: This will make your instance vulnerable!
-  # Turn on this option if you are ONLY connecting from localhost.
-  disable_auth: False
-
-  # Send tracebacks over the API (default: False).
-  # NOTE: Only enable this for debug purposes.
-  send_tracebacks: False
-
-  # Select API servers to enable (default: ["OAI"]).
-  # Possible values: OAI, Kobold.
-  api_servers: []
-
-# Options for logging
-logging:
-  # Enable prompt logging (default: False).
-  log_prompt: False
-
-  # Enable generation parameter logging (default: False).
-  log_generation_params: False
-
-  # Enable request logging (default: False).
-  # NOTE: Only use this for debugging!
-  log_requests: False
-
-# Options for model overrides and loading
-# Please read the comments to understand how arguments are handled
-# between initial and API loads
-model:
-  # Directory to look for models (default: models).
-  # Windows users, do NOT put this path in quotes!
-  model_dir: models
-
-  # Allow direct loading of models from a completion or chat completion request (default: False).
-  inline_model_loading: False
-
-  # Sends dummy model names when the models endpoint is queried.
-  # Enable this if the client is looking for specific OAI models.
-  use_dummy_models: False
-
-  # An initial model to load.
-  # Make sure the model is located in the model directory!
-  # REQUIRED: This must be filled out to load a model on startup.
-  model_name:
-
-  # Names of args to use as a fallback for API load requests (default: []).
-  # For example, if you always want cache_mode to be Q4 instead of on the inital model load, add "cache_mode" to this array.
-  # Example: ['max_seq_len', 'cache_mode'].
-  use_as_default: []
-
-  # Max sequence length (default: Empty).
-  # Fetched from the model's base sequence length in config.json by default.
-  max_seq_len:
-
-  # Overrides base model context length (default: Empty).
-  # WARNING: Don't set this unless you know what you're doing!
-  # Again, do NOT use this for configuring context length, use max_seq_len above ^
-  override_base_seq_len:
-
-  # Load model with tensor parallelism.
-  # Falls back to autosplit if GPU split isn't provided.
-  # This ignores the gpu_split_auto value.
-  tensor_parallel: False
-
-  # Automatically allocate resources to GPUs (default: True).
-  # Not parsed for single GPU users.
-  gpu_split_auto: True
-
-  # Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).
-  # Represented as an array of MB per GPU.
-  autosplit_reserve: [96]
-
-  # An integer array of GBs of VRAM to split between GPUs (default: []).
-  # Used with tensor parallelism.
-  gpu_split: []
-
-  # Rope scale (default: 1.0).
-  # Same as compress_pos_emb.
-  # Use if the model was trained on long context with rope.
-  # Leave blank to pull the value from the model.
-  rope_scale: 1.0
-
-  # Rope alpha (default: 1.0).
-  # Same as alpha_value. Set to "auto" to auto-calculate.
-  rope_alpha: 1.0
-
-  # Enable different cache modes for VRAM savings (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
-  cache_mode: FP16
-
-  # Size of the prompt cache to allocate (default: max_seq_len).
-  # Must be a multiple of 256 and can't be less than max_seq_len.
-  # For CFG, set this to 2 * max_seq_len.
-  cache_size:
-
-  # Chunk size for prompt ingestion (default: 2048).
-  # A lower value reduces VRAM usage but decreases ingestion speed.
-  # NOTE: Effects vary depending on the model.
-  # An ideal value is between 512 and 4096.
-  chunk_size: 2048
-
-  # Set the maximum number of prompts to process at one time (default: None/Automatic).
-  # Automatically calculated if left blank.
-  # NOTE: Only available for Nvidia ampere (30 series) and above GPUs.
-  max_batch_size:
-
-  # Set the prompt template for this model. (default: None)
-  # If empty, attempts to look for the model's chat template.
-  # If a model contains multiple templates in its tokenizer_config.json,
-  # set prompt_template to the name of the template you want to use.
-  # NOTE: Only works with chat completion message lists!
-  prompt_template:
-
-  # Number of experts to use per token.
-  # Fetched from the model's config.json if empty.
-  # NOTE: For MoE models only.
-  # WARNING: Don't set this unless you know what you're doing!
-  num_experts_per_token:
-
-  # Enables fasttensors to possibly increase model loading speeds (default: False).
-  fasttensors: False
-
-# Options for draft models (speculative decoding)
-# This will use more VRAM!
-draft_model:
-  # Directory to look for draft models (default: models)
-  draft_model_dir: models
-
-  # An initial draft model to load.
-  # Ensure the model is in the model directory.
-  draft_model_name:
-
-  # Rope scale for draft models (default: 1.0).
-  # Same as compress_pos_emb.
-  # Use if the draft model was trained on long context with rope.
-  draft_rope_scale: 1.0
-
-  # Rope alpha for draft models (default: None).
-  # Same as alpha_value. Set to "auto" to auto-calculate.
-  draft_rope_alpha:
-
-  # Cache mode for draft models to save VRAM (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
-  draft_cache_mode: FP16
-
-# Options for Loras
-lora:
-  # Directory to look for LoRAs (default: loras).
-  lora_dir: loras
-
-  # List of LoRAs to load and associated scaling factors (default scale: 1.0).
-  # For the YAML file, add each entry as a YAML list:
-  # - name: lora1
-  #   scaling: 1.0
-  loras:
-
-# Options for embedding models and loading.
-# NOTE: Embeddings requires the "extras" feature to be installed
-# Install it via "pip install .[extras]"
-embeddings:
-  # Directory to look for embedding models (default: models).
-  embedding_model_dir: models
-
-  # Device to load embedding models on (default: cpu).
-  # Possible values: cpu, auto, cuda.
-  # NOTE: It's recommended to load embedding models on the CPU.
-  # If using an AMD GPU, set this value to 'cuda'.
-  embeddings_device: cpu
-
-  # An initial embedding model to load on the infinity backend.
-  embedding_model_name:
-
-# Options for Sampling
-sampling:
-  # Select a sampler override preset (default: None).
-  # Find this in the sampler-overrides folder.
-  # This overrides default fallbacks for sampler values that are passed to the API.
-  override_preset:
-
-# Options for development and experimentation
-developer:
-  # Skip Exllamav2 version check (default: False).
-  # WARNING: It's highly recommended to update your dependencies rather than enabling this flag.
-  unsafe_launch: False
-
-  # Disable API request streaming (default: False).
-  disable_request_streaming: False
-
-  # Enable the torch CUDA malloc backend (default: False).
-  cuda_malloc_backend: False
-
-  # Run asyncio using Uvloop or Winloop which can improve performance.
-  # NOTE: It's recommended to enable this, but if something breaks turn this off.
-  uvloop: False
-
-  # Set process to use a higher priority.
-  # For realtime process priority, run as administrator or sudo.
-  # Otherwise, the priority will be set to high.
-  realtime_process_priority: False
+# Sample YAML file for configuration.
+# Comment and uncomment values as needed.
+# Every value has a default within the application.
+# This file serves to be a drop in for config.yml
+
+# Unless specified in the comments, DO NOT put these options in quotes!
+# You can use https://www.yamllint.com/ if you want to check your YAML formatting.
+
+# Options for networking
+network:
+  # The IP to host on (default: 127.0.0.1).
+  # Use 0.0.0.0 to expose on all network adapters.
+  host: 127.0.0.1
+
+  # The port to host on (default: 5000).
+  port: 5000
+
+  # Disable HTTP token authentication with requests.
+  # WARNING: This will make your instance vulnerable!
+  # Turn on this option if you are ONLY connecting from localhost.
+  disable_auth: False
+
+  # Send tracebacks over the API (default: False).
+  # NOTE: Only enable this for debug purposes.
+  send_tracebacks: False
+
+  # Select API servers to enable (default: ["OAI"]).
+  # Possible values: OAI, Kobold.
+  api_servers: ['OAI']
+
+# Options for logging
+logging:
+  # Enable prompt logging (default: False).
+  log_prompt: False
+
+  # Enable generation parameter logging (default: False).
+  log_generation_params: False
+
+  # Enable request logging (default: False).
+  # NOTE: Only use this for debugging!
+  log_requests: False
+
+# Options for model overrides and loading
+# Please read the comments to understand how arguments are handled
+# between initial and API loads
+model:
+  # Directory to look for models (default: models).
+  # Windows users, do NOT put this path in quotes!
+  model_dir: models
+
+  # Allow direct loading of models from a completion or chat completion request (default: False).
+  inline_model_loading: False
+
+  # Sends dummy model names when the models endpoint is queried.
+  # Enable this if the client is looking for specific OAI models.
+  use_dummy_models: False
+
+  # An initial model to load.
+  # Make sure the model is located in the model directory!
+  # REQUIRED: This must be filled out to load a model on startup.
+  model_name:
+
+  # Names of args to use as a fallback for API load requests (default: []).
+  # For example, if you always want cache_mode to be Q4 instead of on the inital model load, add "cache_mode" to this array.
+  # Example: ['max_seq_len', 'cache_mode'].
+  use_as_default: []
+
+  # Max sequence length (default: Empty).
+  # Fetched from the model's base sequence length in config.json by default.
+  max_seq_len:
+
+  # Overrides base model context length (default: Empty).
+  # WARNING: Don't set this unless you know what you're doing!
+  # Again, do NOT use this for configuring context length, use max_seq_len above ^
+  override_base_seq_len:
+
+  # Load model with tensor parallelism.
+  # Falls back to autosplit if GPU split isn't provided.
+  # This ignores the gpu_split_auto value.
+  tensor_parallel: False
+
+  # Automatically allocate resources to GPUs (default: True).
+  # Not parsed for single GPU users.
+  gpu_split_auto: True
+
+  # Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).
+  # Represented as an array of MB per GPU.
+  autosplit_reserve: [96]
+
+  # An integer array of GBs of VRAM to split between GPUs (default: []).
+  # Used with tensor parallelism.
+  gpu_split: []
+
+  # Rope scale (default: 1.0).
+  # Same as compress_pos_emb.
+  # Use if the model was trained on long context with rope.
+  # Leave blank to pull the value from the model.
+  rope_scale: 1.0
+
+  # Rope alpha (default: 1.0).
+  # Same as alpha_value. Set to "auto" to auto-calculate.
+  rope_alpha: 1.0
+
+  # Enable different cache modes for VRAM savings (default: FP16).
+  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  cache_mode: FP16
+
+  # Size of the prompt cache to allocate (default: max_seq_len).
+  # Must be a multiple of 256 and can't be less than max_seq_len.
+  # For CFG, set this to 2 * max_seq_len.
+  cache_size:
+
+  # Chunk size for prompt ingestion (default: 2048).
+  # A lower value reduces VRAM usage but decreases ingestion speed.
+  # NOTE: Effects vary depending on the model.
+  # An ideal value is between 512 and 4096.
+  chunk_size: 2048
+
+  # Set the maximum number of prompts to process at one time (default: None/Automatic).
+  # Automatically calculated if left blank.
+  # NOTE: Only available for Nvidia ampere (30 series) and above GPUs.
+  max_batch_size:
+
+  # Set the prompt template for this model. (default: None)
+  # If empty, attempts to look for the model's chat template.
+  # If a model contains multiple templates in its tokenizer_config.json,
+  # set prompt_template to the name of the template you want to use.
+  # NOTE: Only works with chat completion message lists!
+  prompt_template:
+
+  # Number of experts to use per token.
+  # Fetched from the model's config.json if empty.
+  # NOTE: For MoE models only.
+  # WARNING: Don't set this unless you know what you're doing!
+  num_experts_per_token:
+
+  # Enables fasttensors to possibly increase model loading speeds (default: False).
+  fasttensors: False
+
+# Options for draft models (speculative decoding)
+# This will use more VRAM!
+draft_model:
+  # Directory to look for draft models (default: models)
+  draft_model_dir: models
+
+  # An initial draft model to load.
+  # Ensure the model is in the model directory.
+  draft_model_name:
+
+  # Rope scale for draft models (default: 1.0).
+  # Same as compress_pos_emb.
+  # Use if the draft model was trained on long context with rope.
+  draft_rope_scale: 1.0
+
+  # Rope alpha for draft models (default: None).
+  # Same as alpha_value. Set to "auto" to auto-calculate.
+  draft_rope_alpha:
+
+  # Cache mode for draft models to save VRAM (default: FP16).
+  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  draft_cache_mode: FP16
+
+# Options for Loras
+lora:
+  # Directory to look for LoRAs (default: loras).
+  lora_dir: loras
+
+  # List of LoRAs to load and associated scaling factors (default scale: 1.0).
+  # For the YAML file, add each entry as a YAML list:
+  # - name: lora1
+  #   scaling: 1.0
+  loras:
+
+# Options for embedding models and loading.
+# NOTE: Embeddings requires the "extras" feature to be installed
+# Install it via "pip install .[extras]"
+embeddings:
+  # Directory to look for embedding models (default: models).
+  embedding_model_dir: models
+
+  # Device to load embedding models on (default: cpu).
+  # Possible values: cpu, auto, cuda.
+  # NOTE: It's recommended to load embedding models on the CPU.
+  # If using an AMD GPU, set this value to 'cuda'.
+  embeddings_device: cpu
+
+  # An initial embedding model to load on the infinity backend.
+  embedding_model_name:
+
+# Options for Sampling
+sampling:
+  # Select a sampler override preset (default: None).
+  # Find this in the sampler-overrides folder.
+  # This overrides default fallbacks for sampler values that are passed to the API.
+  override_preset:
+
+# Options for development and experimentation
+developer:
+  # Skip Exllamav2 version check (default: False).
+  # WARNING: It's highly recommended to update your dependencies rather than enabling this flag.
+  unsafe_launch: False
+
+  # Disable API request streaming (default: False).
+  disable_request_streaming: False
+
+  # Enable the torch CUDA malloc backend (default: False).
+  cuda_malloc_backend: False
+
+  # Run asyncio using Uvloop or Winloop which can improve performance.
+  # NOTE: It's recommended to enable this, but if something breaks turn this off.
+  uvloop: False
+
+  # Set process to use a higher priority.
+  # For realtime process priority, run as administrator or sudo.
+  # Otherwise, the priority will be set to high.
+  realtime_process_priority: False
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py
index eb2d431..b169162 100644
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -6,7 +6,6 @@ from typing import List, Literal, Optional, Union
 
 from common.config_models import LoggingConfig
 from common.tabby_config import config
-from common.utils import unwrap
 
 
 class ModelCardParameters(BaseModel):