diff --git a/common/config_models.py b/common/config_models.py
index 2892b2c..6aac505 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -333,12 +333,8 @@ class DraftModelConfig(BaseConfigModel):
 class LoraInstanceModel(BaseConfigModel):
     """Model representing an instance of a Lora."""
 
-    name: Optional[str] = Field(None, description=("Name of the LoRA model."))
-    scaling: float = Field(
-        1.0,
-        description=("Scaling factor for the LoRA model (default: 1.0)."),
-        ge=0,
-    )
+    name: Optional[str] = None
+    scaling: float = Field(1.0, ge=0)
 
 
 class LoraConfig(BaseConfigModel):
diff --git a/common/tabby_config.py b/common/tabby_config.py
index 283dd17..0bf3563 100644
--- a/common/tabby_config.py
+++ b/common/tabby_config.py
@@ -2,15 +2,15 @@ import pathlib
 from inspect import getdoc
 from os import getenv
 from textwrap import dedent
-from typing import Any, Optional
+from typing import Optional
 
 from loguru import logger
 from pydantic import BaseModel
-from pydantic_core import PydanticUndefined
 from ruamel.yaml import YAML
 from ruamel.yaml.comments import CommentedMap, CommentedSeq
+from ruamel.yaml.scalarstring import PreservedScalarString
 
-from common.config_models import TabbyConfigModel
+from common.config_models import BaseConfigModel, TabbyConfigModel
 from common.utils import merge_dicts, unwrap
 
 yaml = YAML()
@@ -174,22 +174,10 @@ config: TabbyConfig = TabbyConfig()
 def generate_config_file(
     model: BaseModel = None,
     filename: str = "config_sample.yml",
-    indentation: int = 2,
 ) -> None:
     """Creates a config.yml file from Pydantic models."""
 
     schema = unwrap(model, TabbyConfigModel())
-    preamble = get_preamble()
-
-    yaml_content = pydantic_model_to_yaml(schema)
-
-    with open(filename, "w") as f:
-        f.write(preamble)
-        yaml.dump(yaml_content, f)
-
-
-def get_preamble() -> str:
-    """Returns the cleaned up preamble for the config file."""
     preamble = """
     # Sample YAML file for configuration.
     # Comment and uncomment values as needed.
@@ -199,43 +187,80 @@ def get_preamble() -> str:
     # Unless specified in the comments, DO NOT put these options in quotes!
     # You can use https://www.yamllint.com/ if you want to check your YAML formatting.\n
     """
-    return dedent(preamble).lstrip()
+
+    yaml_content = pydantic_model_to_yaml(schema)
+
+    with open(filename, "w") as f:
+        f.write(dedent(preamble).lstrip())
+        yaml.dump(yaml_content, f)
 
 
-# Function to convert pydantic model to dict with field descriptions as comments
-def pydantic_model_to_yaml(model: BaseModel) -> CommentedMap:
+def pydantic_model_to_yaml(model: BaseModel, indentation: int = 0) -> CommentedMap:
     """
     Recursively converts a Pydantic model into a CommentedMap,
     with descriptions as comments in YAML.
     """
 
+    # Create a CommentedMap to hold the output data
     yaml_data = CommentedMap()
 
     # Loop through all fields in the model
+    iteration = 1
     for field_name, field_info in model.model_fields.items():
+        # Get the inner pydantic model
         value = getattr(model, field_name)
 
-        # If the field is another Pydantic model
-        if isinstance(value, BaseModel):
-            yaml_data[field_name] = pydantic_model_to_yaml(value)
-        # If the field is a list of Pydantic models
-        elif (
-            isinstance(value, list)
-            and len(value) > 0
-            and isinstance(value[0], BaseModel)
-        ):
-            yaml_list = CommentedSeq()
-            for item in value:
-                yaml_list.append(pydantic_model_to_yaml(item))
-            yaml_data[field_name] = yaml_list
-        # Otherwise, just assign the value
-        else:
-            yaml_data[field_name] = value
+        if isinstance(value, BaseConfigModel):
+            # If the field is another Pydantic model
+
+            if not value._metadata.include_in_config:
+                continue
+
+            yaml_data[field_name] = pydantic_model_to_yaml(
+                value, indentation=indentation + 2
+            )
+            comment = getdoc(value)
+        elif isinstance(value, list) and len(value) > 0:
+            # If the field is a list
+
+            yaml_list = CommentedSeq()
+            if isinstance(value[0], BaseModel):
+                # If the field is a list of Pydantic models
+                # Do not add comments for these items
+
+                for item in value:
+                    yaml_list.append(
+                        pydantic_model_to_yaml(item, indentation=indentation + 2)
+                    )
+            else:
+                # If the field is a normal list, prefer the YAML flow style
+
+                yaml_list.fa.set_flow_style()
+                yaml_list += [
+                    PreservedScalarString(element)
+                    if isinstance(element, str)
+                    else element
+                    for element in value
+                ]
+
+            yaml_data[field_name] = yaml_list
+            comment = field_info.description
+        else:
+            # Otherwise, just assign the value
+
+            yaml_data[field_name] = value
+            comment = field_info.description
+
+        if comment:
+            # Add a newline to every comment but the first one
+            if iteration != 1:
+                comment = f"\n{comment}"
 
-        # Add field description as a comment if available
-        if field_info.description:
             yaml_data.yaml_set_comment_before_after_key(
-                field_name, before=field_info.description
+                field_name, before=comment, indent=indentation
             )
 
+        # Increment the iteration counter
+        iteration += 1
+
     return yaml_data
diff --git a/config_sample.yml b/config_sample.yml
index 10e812f..0da6a36 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -18,27 +18,27 @@ network:
   # Disable HTTP token authentication with requests.
   # WARNING: This will make your instance vulnerable!
   # Turn on this option if you are ONLY connecting from localhost.
-  disable_auth: False
+  disable_auth: false
 
   # Send tracebacks over the API (default: False).
   # NOTE: Only enable this for debug purposes.
-  send_tracebacks: False
+  send_tracebacks: false
 
   # Select API servers to enable (default: ["OAI"]).
   # Possible values: OAI, Kobold.
-  api_servers: ['OAI']
+  api_servers: ["OAI"]
 
 # Options for logging
 logging:
   # Enable prompt logging (default: False).
-  log_prompt: False
+  log_prompt: false
 
   # Enable generation parameter logging (default: False).
-  log_generation_params: False
+  log_generation_params: false
 
   # Enable request logging (default: False).
   # NOTE: Only use this for debugging!
-  log_requests: False
+  log_requests: false
 
 # Options for model overrides and loading
 # Please read the comments to understand how arguments are handled
@@ -49,16 +49,16 @@ model:
   model_dir: models
 
   # Allow direct loading of models from a completion or chat completion request (default: False).
-  inline_model_loading: False
+  inline_model_loading: false
 
   # Sends dummy model names when the models endpoint is queried.
   # Enable this if the client is looking for specific OAI models.
-  use_dummy_models: False
+  use_dummy_models: false
 
   # An initial model to load.
   # Make sure the model is located in the model directory!
   # REQUIRED: This must be filled out to load a model on startup.
-  model_name:
+  model_name: 
 
   # Names of args to use as a fallback for API load requests (default: []).
   # For example, if you always want cache_mode to be Q4 instead of on the inital model load, add "cache_mode" to this array.
@@ -67,21 +67,21 @@ model:
 
   # Max sequence length (default: Empty).
   # Fetched from the model's base sequence length in config.json by default.
-  max_seq_len:
+  max_seq_len: 
 
   # Overrides base model context length (default: Empty).
   # WARNING: Don't set this unless you know what you're doing!
   # Again, do NOT use this for configuring context length, use max_seq_len above ^
-  override_base_seq_len:
+  override_base_seq_len: 
 
   # Load model with tensor parallelism.
   # Falls back to autosplit if GPU split isn't provided.
   # This ignores the gpu_split_auto value.
-  tensor_parallel: False
+  tensor_parallel: false
 
   # Automatically allocate resources to GPUs (default: True).
   # Not parsed for single GPU users.
-  gpu_split_auto: True
+  gpu_split_auto: true
 
   # Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).
   # Represented as an array of MB per GPU.
@@ -108,7 +108,7 @@ model:
   # Size of the prompt cache to allocate (default: max_seq_len).
   # Must be a multiple of 256 and can't be less than max_seq_len.
   # For CFG, set this to 2 * max_seq_len.
-  cache_size:
+  cache_size: 
 
   # Chunk size for prompt ingestion (default: 2048).
   # A lower value reduces VRAM usage but decreases ingestion speed.
@@ -119,23 +119,23 @@ model:
   # Set the maximum number of prompts to process at one time (default: None/Automatic).
   # Automatically calculated if left blank.
   # NOTE: Only available for Nvidia ampere (30 series) and above GPUs.
-  max_batch_size:
+  max_batch_size: 
 
   # Set the prompt template for this model. (default: None)
   # If empty, attempts to look for the model's chat template.
   # If a model contains multiple templates in its tokenizer_config.json,
   # set prompt_template to the name of the template you want to use.
   # NOTE: Only works with chat completion message lists!
-  prompt_template:
+  prompt_template: 
 
   # Number of experts to use per token.
   # Fetched from the model's config.json if empty.
   # NOTE: For MoE models only.
   # WARNING: Don't set this unless you know what you're doing!
-  num_experts_per_token:
+  num_experts_per_token: 
 
   # Enables fasttensors to possibly increase model loading speeds (default: False).
-  fasttensors: False
+  fasttensors: false
 
 # Options for draft models (speculative decoding)
 # This will use more VRAM!
@@ -145,7 +145,7 @@ draft_model:
 
   # An initial draft model to load.
   # Ensure the model is in the model directory.
-  draft_model_name:
+  draft_model_name: 
 
   # Rope scale for draft models (default: 1.0).
   # Same as compress_pos_emb.
@@ -154,7 +154,7 @@ draft_model:
 
   # Rope alpha for draft models (default: None).
   # Same as alpha_value. Set to "auto" to auto-calculate.
-  draft_rope_alpha:
+  draft_rope_alpha: 
 
   # Cache mode for draft models to save VRAM (default: FP16).
   # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
@@ -169,7 +169,7 @@ lora:
   # For the YAML file, add each entry as a YAML list:
   #   - name: lora1
   #     scaling: 1.0
-  loras:
+  loras: 
 
 # Options for embedding models and loading.
 # NOTE: Embeddings requires the "extras" feature to be installed
@@ -185,32 +185,32 @@ embeddings:
   embeddings_device: cpu
 
   # An initial embedding model to load on the infinity backend.
-  embedding_model_name:
+  embedding_model_name: 
 
 # Options for Sampling
 sampling:
   # Select a sampler override preset (default: None).
   # Find this in the sampler-overrides folder.
   # This overrides default fallbacks for sampler values that are passed to the API.
-  override_preset:
+  override_preset: 
 
 # Options for development and experimentation
 developer:
   # Skip Exllamav2 version check (default: False).
   # WARNING: It's highly recommended to update your dependencies rather than enabling this flag.
-  unsafe_launch: False
+  unsafe_launch: false
 
   # Disable API request streaming (default: False).
-  disable_request_streaming: False
+  disable_request_streaming: false
 
   # Enable the torch CUDA malloc backend (default: False).
-  cuda_malloc_backend: False
+  cuda_malloc_backend: false
 
   # Run asyncio using Uvloop or Winloop which can improve performance.
   # NOTE: It's recommended to enable this, but if something breaks turn this off.
-  uvloop: False
+  uvloop: false
 
   # Set process to use a higher priority.
   # For realtime process priority, run as administrator or sudo.
   # Otherwise, the priority will be set to high.
-  realtime_process_priority: False
+  realtime_process_priority: false
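
Usage note (not part of the patch): a minimal sketch of how the reworked generator above can be driven, assuming it is run from the repository root with the project dependencies installed. The import path, the default model fallback, and the default filename come straight from the diff; everything else is illustrative.

    # Regenerate config_sample.yml from the Pydantic config models.
    # With no model argument, generate_config_file() falls back to
    # TabbyConfigModel() via unwrap() and writes the preamble plus the
    # commented YAML tree to the given path.
    from common.tabby_config import generate_config_file

    generate_config_file(filename="config_sample.yml")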