from pathlib import Path

from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
from typing import List, Literal, Optional, Union

CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"]


class Metadata(BaseModel):
    """Metadata model for config options"""

    include_in_config: Optional[bool] = Field(True)


class BaseConfigModel(BaseModel):
    """Base model for config models with added metadata"""

    _metadata: Metadata = PrivateAttr(Metadata())


class ConfigOverrideConfig(BaseConfigModel):
    """Model for overriding a provided config file."""

    # TODO: convert this to a pathlib.path?
    config: Optional[str] = Field(
        None, description=("Path to an overriding config.yml file")
    )

    _metadata: Metadata = PrivateAttr(Metadata(include_in_config=False))


class UtilityActions(BaseConfigModel):
    """Model used for arg actions."""

    # YAML export options
    export_config: Optional[str] = Field(
        None, description="generate a template config file"
    )
    config_export_path: Optional[Path] = Field(
        "config_sample.yml", description="path to export configuration file to"
    )

    # OpenAPI JSON export options
    export_openapi: Optional[bool] = Field(
        False, description="export openapi schema files"
    )
    openapi_export_path: Optional[Path] = Field(
        "openapi.json", description="path to export openapi schema to"
    )

    _metadata: Metadata = PrivateAttr(Metadata(include_in_config=False))


class NetworkConfig(BaseConfigModel):
    """Options for networking"""

    host: Optional[str] = Field(
        "127.0.0.1",
        description=(
            "The IP to host on (default: 127.0.0.1).\n"
            "Use 0.0.0.0 to expose on all network adapters."
        ),
    )
    port: Optional[int] = Field(
        5000, description=("The port to host on (default: 5000).")
    )
    disable_auth: Optional[bool] = Field(
        False,
        description=(
            "Disable HTTP token authentication with requests.\n"
            "WARNING: This will make your instance vulnerable!\n"
            "Turn on this option if you are ONLY connecting from localhost."
        ),
    )
    send_tracebacks: Optional[bool] = Field(
        False,
        description=(
            "Send tracebacks over the API (default: False).\n"
            "NOTE: Only enable this for debug purposes."
        ),
    )
    api_servers: Optional[List[Literal["OAI", "Kobold"]]] = Field(
        default_factory=list,
        description=(
            'Select API servers to enable (default: ["OAI"]).\n'
            "Possible values: OAI, Kobold."
        ),
    )


# TODO: Migrate config.yml to have the log_ prefix
# This is a breaking change.
class LoggingConfig(BaseConfigModel):
    """Options for logging"""

    log_prompt: Optional[bool] = Field(
        False,
        description=("Enable prompt logging (default: False)."),
    )
    log_generation_params: Optional[bool] = Field(
        False,
        description=("Enable generation parameter logging (default: False)."),
    )
    log_requests: Optional[bool] = Field(
        False,
        description=(
            "Enable request logging (default: False).\n"
            "NOTE: Only use this for debugging!"
        ),
    )
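
# A minimal usage sketch (illustrative only, not part of this module's API):
# constructing NetworkConfig directly shows how the Field defaults above
# resolve, and how the Literal list on api_servers is validated. The server
# name "SillyServer" below is a made-up value used to trigger a failure.
#
#   net = NetworkConfig()
#   assert net.host == "127.0.0.1" and net.port == 5000
#
#   NetworkConfig(api_servers=["OAI", "Kobold"])  # passes validation
#   NetworkConfig(api_servers=["SillyServer"])    # raises ValidationError
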

class ModelConfig(BaseConfigModel):
    """
    Options for model overrides and loading
    Please read the comments to understand how arguments are handled
    between initial and API loads
    """

    # TODO: convert this to a pathlib.path?
    model_dir: str = Field(
        "models",
        description=(
            "Directory to look for models (default: models).\n"
            "Windows users, do NOT put this path in quotes!"
        ),
    )
    inline_model_loading: Optional[bool] = Field(
        False,
        description=(
            "Allow direct loading of models "
            "from a completion or chat completion request (default: False)."
        ),
    )
    use_dummy_models: Optional[bool] = Field(
        False,
        description=(
            "Sends dummy model names when the models endpoint is queried.\n"
            "Enable this if the client is looking for specific OAI models."
        ),
    )
    model_name: Optional[str] = Field(
        None,
        description=(
            "An initial model to load.\n"
            "Make sure the model is located in the model directory!\n"
            "REQUIRED: This must be filled out to load a model on startup."
        ),
    )
    use_as_default: List[str] = Field(
        default_factory=list,
        description=(
            "Names of args to use as a fallback for API load requests (default: []).\n"
            "For example, if you always want cache_mode to be Q4 "
            'instead of on the initial model load, add "cache_mode" to this array.\n'
            "Example: ['max_seq_len', 'cache_mode']."
        ),
    )
    max_seq_len: Optional[int] = Field(
        None,
        description=(
            "Max sequence length (default: Empty).\n"
            "Fetched from the model's base sequence length in config.json by default."
        ),
        ge=0,
    )
    override_base_seq_len: Optional[int] = Field(
        None,
        description=(
            "Overrides base model context length (default: Empty).\n"
            "WARNING: Don't set this unless you know what you're doing!\n"
            "Again, do NOT use this for configuring context length, "
            "use max_seq_len above ^"
        ),
        ge=0,
    )
    tensor_parallel: Optional[bool] = Field(
        False,
        description=(
            "Load model with tensor parallelism.\n"
            "Falls back to autosplit if GPU split isn't provided.\n"
            "This ignores the gpu_split_auto value."
        ),
    )
    gpu_split_auto: Optional[bool] = Field(
        True,
        description=(
            "Automatically allocate resources to GPUs (default: True).\n"
            "Not parsed for single GPU users."
        ),
    )
    autosplit_reserve: List[int] = Field(
        [96],
        description=(
            "Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).\n"
            "Represented as an array of MB per GPU."
        ),
    )
    gpu_split: List[float] = Field(
        default_factory=list,
        description=(
            "An array of GBs of VRAM to split between GPUs (default: []).\n"
            "Used with tensor parallelism."
        ),
    )
    rope_scale: Optional[float] = Field(
        1.0,
        description=(
            "Rope scale (default: 1.0).\n"
            "Same as compress_pos_emb.\n"
            "Use if the model was trained on long context with rope.\n"
            "Leave blank to pull the value from the model."
        ),
    )
    rope_alpha: Optional[Union[float, Literal["auto"]]] = Field(
        1.0,
        description=(
            "Rope alpha (default: 1.0).\n"
            'Same as alpha_value. Set to "auto" to auto-calculate.'
        ),
    )
    cache_mode: Optional[CACHE_SIZES] = Field(
        "FP16",
        description=(
            "Enable different cache modes for VRAM savings (default: FP16).\n"
            f"Possible values: {str(CACHE_SIZES)[15:-1]}."
        ),
    )
    cache_size: Optional[int] = Field(
        None,
        description=(
            "Size of the prompt cache to allocate (default: max_seq_len).\n"
            "Must be a multiple of 256 and can't be less than max_seq_len.\n"
            "For CFG, set this to 2 * max_seq_len."
        ),
        multiple_of=256,
        gt=0,
    )
    chunk_size: Optional[int] = Field(
        2048,
        description=(
            "Chunk size for prompt ingestion (default: 2048).\n"
            "A lower value reduces VRAM usage but decreases ingestion speed.\n"
            "NOTE: Effects vary depending on the model.\n"
            "An ideal value is between 512 and 4096."
        ),
        gt=0,
    )
    max_batch_size: Optional[int] = Field(
        None,
        description=(
            "Set the maximum number of prompts to process at one time "
            "(default: None/Automatic).\n"
            "Automatically calculated if left blank.\n"
            "NOTE: Only available for Nvidia Ampere (30 series) and above GPUs."
        ),
        ge=1,
    )
    prompt_template: Optional[str] = Field(
        None,
        description=(
            "Set the prompt template for this model. (default: None)\n"
            "If empty, attempts to look for the model's chat template.\n"
            "If a model contains multiple templates in its tokenizer_config.json,\n"
            "set prompt_template to the name of the template you want to use.\n"
            "NOTE: Only works with chat completion message lists!"
        ),
    )
    num_experts_per_token: Optional[int] = Field(
        None,
        description=(
            "Number of experts to use per token.\n"
            "Fetched from the model's config.json if empty.\n"
            "NOTE: For MoE models only.\n"
            "WARNING: Don't set this unless you know what you're doing!"
        ),
        ge=1,
    )
    fasttensors: Optional[bool] = Field(
        False,
        description=(
            "Enables fasttensors to possibly increase model loading speeds "
            "(default: False)."
        ),
    )

    _metadata: Metadata = PrivateAttr(Metadata())
    model_config = ConfigDict(protected_namespaces=())
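
# Illustrative sketch (assumed usage, not exercised by this module): the
# numeric constraints declared above are enforced by pydantic at construction
# time, so invalid combinations fail before a model load is ever attempted.
#
#   ModelConfig(max_seq_len=8192, cache_mode="Q4")  # valid
#   ModelConfig(cache_size=4096)                    # valid: multiple of 256
#   ModelConfig(cache_size=4000)                    # ValidationError: not a
#                                                   # multiple of 256
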

class DraftModelConfig(BaseConfigModel):
    """
    Options for draft models (speculative decoding)
    This will use more VRAM!
    """

    # TODO: convert this to a pathlib.path?
    draft_model_dir: Optional[str] = Field(
        "models",
        description=("Directory to look for draft models (default: models)"),
    )
    draft_model_name: Optional[str] = Field(
        None,
        description=(
            "An initial draft model to load.\n"
            "Ensure the model is in the model directory."
        ),
    )
    draft_rope_scale: Optional[float] = Field(
        1.0,
        description=(
            "Rope scale for draft models (default: 1.0).\n"
            "Same as compress_pos_emb.\n"
            "Use if the draft model was trained on long context with rope."
        ),
    )
    draft_rope_alpha: Optional[Union[float, Literal["auto"]]] = Field(
        None,
        description=(
            "Rope alpha for draft models (default: None).\n"
            'Same as alpha_value. Set to "auto" to auto-calculate.'
        ),
    )
    draft_cache_mode: Optional[CACHE_SIZES] = Field(
        "FP16",
        description=(
            "Cache mode for draft models to save VRAM (default: FP16).\n"
            f"Possible values: {str(CACHE_SIZES)[15:-1]}."
        ),
    )


class LoraInstanceModel(BaseConfigModel):
    """Model representing an instance of a Lora."""

    name: str = Field(..., description=("Name of the LoRA model."))
    scaling: float = Field(
        1.0,
        description=("Scaling factor for the LoRA model (default: 1.0)."),
        ge=0,
    )


class LoraConfig(BaseConfigModel):
    """Options for Loras"""

    # TODO: convert this to a pathlib.path?
    lora_dir: Optional[str] = Field(
        "loras", description=("Directory to look for LoRAs (default: loras).")
    )
    loras: Optional[List[LoraInstanceModel]] = Field(
        None,
        description=(
            "List of LoRAs to load and associated scaling factors "
            "(default scale: 1.0)."
        ),
    )


class EmbeddingsConfig(BaseConfigModel):
    """
    Options for embedding models and loading.

    NOTE: Embeddings requires the "extras" feature to be installed
    Install it via "pip install .[extras]"
    """

    # TODO: convert this to a pathlib.path?
    embedding_model_dir: Optional[str] = Field(
        "models",
        description=("Directory to look for embedding models (default: models)."),
    )
    embeddings_device: Optional[Literal["cpu", "auto", "cuda"]] = Field(
        "cpu",
        description=(
            "Device to load embedding models on (default: cpu).\n"
            "Possible values: cpu, auto, cuda.\n"
            "NOTE: It's recommended to load embedding models on the CPU.\n"
            "If using an AMD GPU, set this value to 'cuda'."
        ),
    )
    embedding_model_name: Optional[str] = Field(
        None,
        description=("An initial embedding model to load on the infinity backend."),
    )


class SamplingConfig(BaseConfigModel):
    """Options for Sampling"""

    override_preset: Optional[str] = Field(
        None,
        description=(
            "Select a sampler override preset (default: None).\n"
            "Find this in the sampler-overrides folder.\n"
            "This overrides default fallbacks for sampler values "
            "that are passed to the API."
        ),
    )
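
# Sketch of a nested payload (hypothetical values): loras accepts a list of
# LoraInstanceModel entries, and pydantic also coerces plain dicts of the same
# shape, which is how a parsed config.yml section would arrive.
#
#   LoraConfig(
#       lora_dir="loras",
#       loras=[{"name": "my-lora", "scaling": 0.8}],
#   )
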

class DeveloperConfig(BaseConfigModel):
    """Options for development and experimentation"""

    unsafe_launch: Optional[bool] = Field(
        False,
        description=(
            "Skip Exllamav2 version check (default: False).\n"
            "WARNING: It's highly recommended to update your dependencies rather "
            "than enabling this flag."
        ),
    )
    disable_request_streaming: Optional[bool] = Field(
        False, description=("Disable API request streaming (default: False).")
    )
    cuda_malloc_backend: Optional[bool] = Field(
        False, description=("Enable the torch CUDA malloc backend (default: False).")
    )
    uvloop: Optional[bool] = Field(
        False,
        description=(
            "Run asyncio using Uvloop or Winloop, which can improve performance.\n"
            "NOTE: It's recommended to enable this, but if something breaks, "
            "turn this off."
        ),
    )
    realtime_process_priority: Optional[bool] = Field(
        False,
        description=(
            "Set process to use a higher priority.\n"
            "For realtime process priority, run as administrator or sudo.\n"
            "Otherwise, the priority will be set to high."
        ),
    )


class TabbyConfigModel(BaseModel):
    """Base model for a TabbyConfig."""

    config: Optional[ConfigOverrideConfig] = Field(
        default_factory=ConfigOverrideConfig.model_construct
    )
    network: Optional[NetworkConfig] = Field(
        default_factory=NetworkConfig.model_construct
    )
    logging: Optional[LoggingConfig] = Field(
        default_factory=LoggingConfig.model_construct
    )
    model: Optional[ModelConfig] = Field(default_factory=ModelConfig.model_construct)
    draft_model: Optional[DraftModelConfig] = Field(
        default_factory=DraftModelConfig.model_construct
    )
    lora: Optional[LoraConfig] = Field(default_factory=LoraConfig.model_construct)
    embeddings: Optional[EmbeddingsConfig] = Field(
        default_factory=EmbeddingsConfig.model_construct
    )
    sampling: Optional[SamplingConfig] = Field(
        default_factory=SamplingConfig.model_construct
    )
    developer: Optional[DeveloperConfig] = Field(
        default_factory=DeveloperConfig.model_construct
    )
    actions: Optional[UtilityActions] = Field(
        default_factory=UtilityActions.model_construct
    )

    model_config = ConfigDict(validate_assignment=True, protected_namespaces=())
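
if __name__ == "__main__":
    # Minimal demonstration sketch (assumption: this module is run directly as
    # a script; nothing upstream requires this). Builds the default config tree
    # and shows that validate_assignment=True re-validates attribute writes.
    cfg = TabbyConfigModel()
    print(cfg.model_dump())

    # Reassignment is validated because of validate_assignment=True above.
    cfg.network = NetworkConfig(host="0.0.0.0", port=8080)
    print(cfg.network.port)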