From 7fe0dbd62f7a8d66d3ca5eb7f8ade52d98e91c98 Mon Sep 17 00:00:00 2001
From: kingbri
Date: Mon, 16 Sep 2024 23:32:54 -0400
Subject: [PATCH] Tree: Update config_sample

Uses the new YAML generator.

Signed-off-by: kingbri
---
 config_sample.yml | 312 ++++++++++++++++++++++------------------
 1 file changed, 148 insertions(+), 164 deletions(-)

diff --git a/config_sample.yml b/config_sample.yml
index 3b4f247..bd790db 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -1,5 +1,6 @@
 # Sample YAML file for configuration.
-# Comment and uncomment values as needed. Every value has a default within the application.
+# Comment and uncomment values as needed.
+# Every value has a default within the application.
 # This file serves to be a drop in for config.yml

 # Unless specified in the comments, DO NOT put these options in quotes!

 # Options for networking
 network:
   # The IP to host on (default: 127.0.0.1).
-  # Use 0.0.0.0 to expose on all network adapters
+  # Use 0.0.0.0 to expose on all network adapters.
   host: 127.0.0.1

-  # The port to host on (default: 5000)
+  # The port to host on (default: 5000).
   port: 5000

-  # Disable HTTP token authenticaion with requests
+  # Disable HTTP token authentication with requests.
   # WARNING: This will make your instance vulnerable!
-  # Turn on this option if you are ONLY connecting from localhost
+  # Turn on this option if you are ONLY connecting from localhost.
   disable_auth: False

-  # Send tracebacks over the API to clients (default: False)
-  # NOTE: Only enable this for debug purposes
+  # Send tracebacks over the API (default: False).
+  # NOTE: Only enable this for debug purposes.
   send_tracebacks: False

-  # Select API servers to enable (default: ["OAI"])
-  # Possible values: OAI
-  api_servers: ["OAI"]
+  # Select API servers to enable (default: ["OAI"]).
+  # Possible values: OAI, Kobold.
+  api_servers: []

 # Options for logging
 logging:
-  # Enable prompt logging (default: False)
-  prompt: False
+  # Enable prompt logging (default: False).
+  log_prompt: False

-  # Enable generation parameter logging (default: False)
-  generation_params: False
+  # Enable generation parameter logging (default: False).
+  log_generation_params: False

-  # Enable request logging (default: False)
+  # Enable request logging (default: False).
   # NOTE: Only use this for debugging!
-  requests: False
-
-# Options for sampling
-sampling:
-  # Override preset name. Find this in the sampler-overrides folder (default: None)
-  # This overrides default fallbacks for sampler values that are passed to the API
-  # Server-side overrides are NOT needed by default
-  # WARNING: Using this can result in a generation speed penalty
-  #override_preset:
-
-# Options for development and experimentation
-developer:
-  # Skips exllamav2 version check (default: False)
-  # It's highly recommended to update your dependencies rather than enabling this flag
-  # WARNING: Don't set this unless you know what you're doing!
-  #unsafe_launch: False
-
-  # Disable all request streaming (default: False)
-  # A kill switch for turning off SSE in the API server
-  #disable_request_streaming: False
-
-  # Enable the torch CUDA malloc backend (default: False)
-  # This can save a few MBs of VRAM, but has a risk of errors. Use at your own risk.
-  #cuda_malloc_backend: False
-
-  # Enable Uvloop or Winloop (default: False)
-  # Make the program utilize a faster async event loop which can improve performance
-  # NOTE: It's recommended to enable this, but if something breaks, turn this off.
-  #uvloop: False
-
-  # Set process to use a higher priority
-  # For realtime process priority, run as administrator or sudo
-  # Otherwise, the priority will be set to high
-  #realtime_process_priority: False
+  log_requests: False

 # Options for model overrides and loading
-# Please read the comments to understand how arguments are handled between initial and API loads
+# Please read the comments to understand how arguments are handled
+# between initial and API loads.
 model:
-  # Overrides the directory to look for models (default: models)
-  # Windows users, DO NOT put this path in quotes! This directory will be invalid otherwise.
+  # Directory to look for models (default: models).
+  # Windows users, do NOT put this path in quotes!
   model_dir: models

-  # Sends dummy model names when the models endpoint is queried
-  # Enable this if the program is looking for a specific OAI model
-  #use_dummy_models: False
-
-  # Allow direct loading of models from a completion or chat completion request
+  # Allow direct loading of models from a completion or chat completion request (default: False).
   inline_model_loading: False

-  # An initial model to load. Make sure the model is located in the model directory!
-  # A model can be loaded later via the API.
-  # REQUIRED: This must be filled out to load a model on startup!
-  model_name:
+  # Sends dummy model names when the models endpoint is queried.
+  # Enable this if the client is looking for specific OAI models.
+  use_dummy_models: False

-  # The below parameters only apply for initial loads
-  # All API based loads do NOT inherit these settings unless specified in use_as_default
+  # An initial model to load.
+  # Make sure the model is located in the model directory!
+  # REQUIRED: This must be filled out to load a model on startup.
+  model_name:

-  # Names of args to use as a default fallback for API load requests (default: [])
-  # For example, if you always want cache_mode to be Q4 instead of on the inital model load,
-  # Add "cache_mode" to this array
-  # Ex. ["max_seq_len", "cache_mode"]
-  #use_as_default: []
+  # Names of args to use as a fallback for API load requests (default: []).
+  # For example, if you always want cache_mode to be Q4 instead of only on the initial model load, add "cache_mode" to this array.
+  # Example: ['max_seq_len', 'cache_mode'].
+  use_as_default: []

-  # The below parameters apply only if model_name is set
+  # Max sequence length (default: Empty).
+  # Fetched from the model's base sequence length in config.json by default.
+  max_seq_len:

-  # Max sequence length (default: Empty)
-  # Fetched from the model's base sequence length in config.json by default
-  #max_seq_len:
-
-  # Overrides base model context length (default: Empty)
+  # Overrides base model context length (default: Empty).
   # WARNING: Don't set this unless you know what you're doing!
   # Again, do NOT use this for configuring context length, use max_seq_len above ^
-  # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral 7B)
-  #override_base_seq_len:
+  override_base_seq_len:

-  # Load model with tensor parallelism
-  # If a GPU split isn't provided, the TP loader will fallback to autosplit
-  # Enabling ignores the gpu_split_auto and autosplit_reserve values
-  #tensor_parallel: False
+  # Load model with tensor parallelism.
+  # Falls back to autosplit if GPU split isn't provided.
+  # This ignores the gpu_split_auto value.
+  tensor_parallel: False

-  # Automatically allocate resources to GPUs (default: True)
-  # NOTE: Not parsed for single GPU users
-  #gpu_split_auto: True
+  # Automatically allocate resources to GPUs (default: True).
+  # Not parsed for single GPU users.
+  gpu_split_auto: True

-  # Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0)
-  # This is represented as an array of MB per GPU used
-  #autosplit_reserve: [96]
+  # Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).
+  # Represented as an array of MB per GPU.
+  autosplit_reserve: [96]

-  # An integer array of GBs of vram to split between GPUs (default: [])
-  # Used with tensor parallelism
-  # NOTE: Not parsed for single GPU users
-  #gpu_split: [20.6, 24]
+  # An integer array of GBs of VRAM to split between GPUs (default: []).
+  # Used with tensor parallelism.
+  gpu_split: []

-  # Rope scale (default: 1.0)
-  # Same thing as compress_pos_emb
-  # Only use if your model was trained on long context with rope (check config.json)
-  # Leave blank to pull the value from the model
-  #rope_scale: 1.0
+  # Rope scale (default: 1.0).
+  # Same as compress_pos_emb.
+  # Use if the model was trained on long context with rope.
+  # Leave blank to pull the value from the model.
+  rope_scale: 1.0

-  # Rope alpha (default: 1.0)
-  # Same thing as alpha_value
-  # Set to "auto" to automatically calculate
-  # Leave blank to pull the value from the model
-  #rope_alpha: 1.0
+  # Rope alpha (default: 1.0).
+  # Same as alpha_value. Set to "auto" to auto-calculate.
+  rope_alpha: 1.0

-  # Enable different cache modes for VRAM savings (slight performance hit).
-  # Possible values FP16, Q8, Q6, Q4. (default: FP16)
-  #cache_mode: FP16
+  # Enable different cache modes for VRAM savings (default: FP16).
+  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  cache_mode: FP16

-  # Size of the prompt cache to allocate (default: max_seq_len)
-  # This must be a multiple of 256. A larger cache uses more VRAM, but allows for more prompts to be processed at once.
-  # NOTE: Cache size should not be less than max_seq_len.
-  # For CFG, set this to 2 * max_seq_len to make room for both positive and negative prompts.
-  #cache_size:
+  # Size of the prompt cache to allocate (default: max_seq_len).
+  # Must be a multiple of 256 and can't be less than max_seq_len.
+  # For CFG, set this to 2 * max_seq_len.
+  cache_size:

-  # Chunk size for prompt ingestion. A lower value reduces VRAM usage at the cost of ingestion speed (default: 2048)
-  # NOTE: Effects vary depending on the model. An ideal value is between 512 and 4096
-  #chunk_size: 2048
+  # Chunk size for prompt ingestion (default: 2048).
+  # A lower value reduces VRAM usage but decreases ingestion speed.
+  # NOTE: Effects vary depending on the model.
+  # An ideal value is between 512 and 4096.
+  chunk_size: 2048

-  # Set the maximum amount of prompts to process at one time (default: None/Automatic)
-  # This will be automatically calculated if left blank.
-  # A max batch size of 1 processes prompts one at a time.
-  # NOTE: Only available for Nvidia ampere (30 series) and above GPUs
-  #max_batch_size:
+  # Set the maximum number of prompts to process at one time (default: None/Automatic).
+  # Automatically calculated if left blank.
+  # NOTE: Only available for Nvidia Ampere (30 series) and above GPUs.
+  max_batch_size:

-  # Set the prompt template for this model. If empty, attempts to look for the model's chat template. (default: None)
-  # If a model contains multiple templates in its tokenizer_config.json, set prompt_template to the name
-  # of the template you want to use.
+  # Set the prompt template for this model (default: None).
+  # If empty, attempts to look for the model's chat template.
+  # If a model contains multiple templates in its tokenizer_config.json,
+  # set prompt_template to the name of the template you want to use.
   # NOTE: Only works with chat completion message lists!
-  #prompt_template:
+  prompt_template:

-  # Number of experts to use PER TOKEN. Fetched from the model's config.json if not specified (default: Empty)
+  # Number of experts to use per token.
+  # Fetched from the model's config.json if empty.
+  # NOTE: For MoE models only.
   # WARNING: Don't set this unless you know what you're doing!
-  # NOTE: For MoE models (ex. Mixtral) only!
-  #num_experts_per_token:
+  num_experts_per_token:

-  # Enables fasttensors to possibly increase model loading speeds (default: False)
-  #fasttensors: true
+  # Enables fasttensors to possibly increase model loading speeds (default: False).
+  fasttensors: False

-  # Options for draft models (speculative decoding). This will use more VRAM!
-  #draft:
-    # Overrides the directory to look for draft (default: models)
-    #draft_model_dir: models
+# Options for draft models (speculative decoding)
+# This will use more VRAM!
+draft_model:
+  # Directory to look for draft models (default: models).
+  draft_model_dir: models

-    # An initial draft model to load. Make sure this model is located in the model directory!
-    # A draft model can be loaded later via the API.
-    #draft_model_name: A model name
-
-    # The below parameters only apply for initial loads
-    # All API based loads do NOT inherit these settings unless specified in use_as_default
+  # An initial draft model to load.
+  # Ensure the model is in the model directory.
+  draft_model_name:

-    # Rope scale for draft models (default: 1.0)
-    # Same thing as compress_pos_emb
-    # Only use if your draft model was trained on long context with rope (check config.json)
-    #draft_rope_scale: 1.0
+  # Rope scale for draft models (default: 1.0).
+  # Same as compress_pos_emb.
+  # Use if the draft model was trained on long context with rope.
+  draft_rope_scale: 1.0

-    # Rope alpha for draft model (default: 1.0)
-    # Same thing as alpha_value
-    # Leave blank to automatically calculate alpha value
-    #draft_rope_alpha: 1.0
+  # Rope alpha for draft models (default: None).
+  # Same as alpha_value. Set to "auto" to auto-calculate.
+  draft_rope_alpha:

-    # Enable different draft model cache modes for VRAM savings (slight performance hit).
-    # Possible values FP16, Q8, Q6, Q4. (default: FP16)
-    #draft_cache_mode: FP16
-
-  # Options for loras
-  #lora:
-    # Overrides the directory to look for loras (default: loras)
-    #lora_dir: loras
-
-    # List of loras to load and associated scaling factors (default: 1.0). Comment out unused entries or add more rows as needed.
-    #loras:
-    #- name: lora1
-    #  scaling: 1.0
+  # Cache mode for draft models to save VRAM (default: FP16).
+  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  draft_cache_mode: FP16
+
+# Options for LoRAs
+lora:
+  # Directory to look for LoRAs (default: loras).
+  lora_dir: loras
+
+  # List of LoRAs to load and associated scaling factors (default scale: 1.0).
+  # For the YAML file, add each entry as a YAML list:
+  #   - name: lora1
+  #     scaling: 1.0
+  loras:

 # Options for embedding models and loading.
 # NOTE: Embeddings requires the "extras" feature to be installed
 # Install it via "pip install .[extras]"
 embeddings:
-  # Overrides directory to look for embedding models (default: models)
+  # Directory to look for embedding models (default: models).
   embedding_model_dir: models

-  # Device to load embedding models on (default: cpu)
-  # Possible values: cpu, auto, cuda
+  # Device to load embedding models on (default: cpu).
+  # Possible values: cpu, auto, cuda.
   # NOTE: It's recommended to load embedding models on the CPU.
-  # If you'd like to load on an AMD gpu, set this value to "cuda" as well.
+  # If using an AMD GPU, set this value to 'cuda'.
   embeddings_device: cpu

-  # The below parameters only apply for initial loads
-  # All API based loads do NOT inherit these settings unless specified in use_as_default
+  # An initial embedding model to load on the infinity backend.
+  embedding_model_name:

-  # An initial embedding model to load on the infinity backend (default: None)
-  embedding_model_name:
+# Options for sampling
+sampling:
+  # Select a sampler override preset (default: None).
+  # Find this in the sampler-overrides folder.
+  # This overrides default fallbacks for sampler values that are passed to the API.
+  override_preset:
+
+# Options for development and experimentation
+developer:
+  # Skip ExLlamaV2 version check (default: False).
+  # WARNING: It's highly recommended to update your dependencies rather than enabling this flag.
+  unsafe_launch: False
+
+  # Disable API request streaming (default: False).
+  disable_request_streaming: False
+
+  # Enable the torch CUDA malloc backend (default: False).
+  cuda_malloc_backend: False
+
+  # Run asyncio using Uvloop or Winloop, which can improve performance.
+  # NOTE: It's recommended to enable this, but if something breaks, turn this off.
+  uvloop: False
+
+  # Set process to use a higher priority.
+  # For realtime process priority, run as administrator or sudo.
+  # Otherwise, the priority will be set to high.
+  realtime_process_priority: False
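
For anyone updating an existing config.yml by hand, a trimmed-down file using the new layout might look like the sketch below. This is illustrative only: the model and LoRA names, sequence length, and cache mode are placeholder assumptions, not values shipped with this patch; adjust them to whatever actually lives in your model_dir and lora_dir.

# Minimal illustrative config.yml using the keys introduced above (not a project default).
network:
  host: 127.0.0.1
  port: 5000

model:
  model_dir: models
  model_name: example-exl2-model   # placeholder: any model folder inside model_dir
  max_seq_len: 8192                # placeholder; leave blank to use the model's config.json value
  cache_mode: Q4

lora:
  lora_dir: loras
  loras:
    - name: example-lora           # placeholder: a LoRA folder inside lora_dir
      scaling: 1.0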