# tabbyAPI-ollama/common/config_models.py
from pathlib import Path
from pydantic import (
    BaseModel,
    ConfigDict,
    Field,
    PrivateAttr,
    field_validator,
)
from typing import List, Literal, Optional, Union
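
# Allowed KV cache modes, used by cache_mode and draft_cache_mode below.
# FP16 is the full-precision default; Q8/Q6/Q4 trade accuracy for VRAM savings.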
CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"]


class Metadata(BaseModel):
    """Metadata model for config options"""

    include_in_config: Optional[bool] = Field(True)


class BaseConfigModel(BaseModel):
    """Base model for config models with added metadata"""

    _metadata: Metadata = PrivateAttr(Metadata())
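
# Option groups that should not appear in an exported config file override
# _metadata with include_in_config=False (see ConfigOverrideConfig and
# UtilityActions below).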


class ConfigOverrideConfig(BaseConfigModel):
    """Model for overriding a provided config file."""

    # TODO: convert this to a pathlib.Path?
    config: Optional[str] = Field(
        None, description=("Path to an overriding config.yml file")
    )

    _metadata: Metadata = PrivateAttr(Metadata(include_in_config=False))


class UtilityActions(BaseConfigModel):
    """Model used for arg actions."""

    # YAML export options
    export_config: Optional[str] = Field(
        None, description="generate a template config file"
    )
    config_export_path: Optional[Path] = Field(
        "config_sample.yml", description="path to export configuration file to"
    )

    # OpenAPI JSON export options
    export_openapi: Optional[bool] = Field(
        False, description="export openapi schema files"
    )
    openapi_export_path: Optional[Path] = Field(
        "openapi.json", description="path to export openapi schema to"
    )

    _metadata: Metadata = PrivateAttr(Metadata(include_in_config=False))


class NetworkConfig(BaseConfigModel):
    """Options for networking"""

    host: Optional[str] = Field(
        "127.0.0.1",
        description=(
            "The IP to host on (default: 127.0.0.1).\n"
            "Use 0.0.0.0 to expose on all network adapters."
        ),
    )
    port: Optional[int] = Field(
        5000, description=("The port to host on (default: 5000).")
    )
    disable_auth: Optional[bool] = Field(
        False,
        description=(
            "Disable HTTP token authentication with requests.\n"
            "WARNING: This will make your instance vulnerable!\n"
            "Turn on this option if you are ONLY connecting from localhost."
        ),
    )
    disable_fetch_requests: Optional[bool] = Field(
        False,
        description=(
            "Disable fetching external content in response to requests, "
            "such as images from URLs."
        ),
    )
    send_tracebacks: Optional[bool] = Field(
        False,
        description=(
            "Send tracebacks over the API (default: False).\n"
            "NOTE: Only enable this for debug purposes."
        ),
    )
    api_servers: Optional[List[Literal["oai", "kobold"]]] = Field(
        ["OAI"],
        description=(
            'Select API servers to enable (default: ["OAI"]).\n'
            "Possible values: OAI, Kobold."
        ),
    )

    # Converts all strings in the api_servers list to lowercase
    # NOTE: Expand if more models need this validator
    @field_validator("api_servers", mode="before")
    def api_server_validator(cls, api_servers):
        return [server_name.lower() for server_name in api_servers]
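
# For example, NetworkConfig(api_servers=["OAI", "Kobold"]).api_servers is
# normalized to ["oai", "kobold"] before the Literal values are checked.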


# TODO: Migrate config.yml to have the log_ prefix
# This is a breaking change.
class LoggingConfig(BaseConfigModel):
    """Options for logging"""

    log_prompt: Optional[bool] = Field(
        False,
        description=("Enable prompt logging (default: False)."),
    )
    log_generation_params: Optional[bool] = Field(
        False,
        description=("Enable generation parameter logging (default: False)."),
    )
    log_requests: Optional[bool] = Field(
        False,
        description=(
            "Enable request logging (default: False).\n"
            "NOTE: Only use this for debugging!"
        ),
    )


class ModelConfig(BaseConfigModel):
    """
    Options for model overrides and loading
    Please read the comments to understand how arguments are handled
    between initial and API loads
    """

    # TODO: convert this to a pathlib.Path?
    model_dir: str = Field(
        "models",
        description=(
            "Directory to look for models (default: models).\n"
            "Windows users, do NOT put this path in quotes!"
        ),
    )
    inline_model_loading: Optional[bool] = Field(
        False,
        description=(
            "Allow direct loading of models "
            "from a completion or chat completion request (default: False).\n"
            "This method of loading is strict by default.\n"
            "Enable dummy models to add exceptions for invalid model names."
        ),
    )
    use_dummy_models: Optional[bool] = Field(
        False,
        description=(
            "Sends dummy model names when the models endpoint is queried. "
            "(default: False)\n"
            "Enable this if the client is looking for specific OAI models.\n"
        ),
    )
    dummy_model_names: List[str] = Field(
        default=["gpt-3.5-turbo"],
        description=(
            "A list of fake model names that are sent via the /v1/models endpoint. "
            '(default: ["gpt-3.5-turbo"])\n'
            "Also used as bypasses for strict mode if inline_model_loading is true."
        ),
    )
    model_name: Optional[str] = Field(
        None,
        description=(
            "An initial model to load.\n"
            "Make sure the model is located in the model directory!\n"
            "REQUIRED: This must be filled out to load a model on startup."
        ),
    )
    use_as_default: List[str] = Field(
        default_factory=list,
        description=(
            "Names of args to use as a fallback for API load requests (default: []).\n"
            "For example, if you always want cache_mode to be Q4 "
            'instead of on the initial model load, add "cache_mode" to this array.\n'
            "Example: ['max_seq_len', 'cache_mode']."
        ),
    )
    max_seq_len: Optional[int] = Field(
        None,
        description=(
            "Max sequence length (default: Empty).\n"
            "Fetched from the model's base sequence length in config.json by default."
        ),
        ge=0,
    )
    tensor_parallel: Optional[bool] = Field(
        False,
        description=(
            "Load model with tensor parallelism.\n"
            "Falls back to autosplit if GPU split isn't provided.\n"
            "This ignores the gpu_split_auto value."
        ),
    )
    gpu_split_auto: Optional[bool] = Field(
        True,
        description=(
            "Automatically allocate resources to GPUs (default: True).\n"
            "Not parsed for single GPU users."
        ),
    )
    autosplit_reserve: List[int] = Field(
        [96],
        description=(
            "Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).\n"
            "Represented as an array of MB per GPU."
        ),
    )
    gpu_split: List[float] = Field(
        default_factory=list,
        description=(
            "An integer array of GBs of VRAM to split between GPUs (default: []).\n"
            "Used with tensor parallelism."
        ),
    )
    rope_scale: Optional[float] = Field(
        1.0,
        description=(
            "Rope scale (default: 1.0).\n"
            "Same as compress_pos_emb.\n"
            "Use if the model was trained on long context with rope.\n"
            "Leave blank to pull the value from the model."
        ),
    )
    rope_alpha: Optional[Union[float, Literal["auto"]]] = Field(
        None,
        description=(
            "Rope alpha (default: None).\n"
            'Same as alpha_value. Set to "auto" to auto-calculate.\n'
            "Leaving this value blank will either pull from the model "
            "or auto-calculate."
        ),
    )
    # NOTE: str(CACHE_SIZES)[15:-1] strips the leading "typing.Literal[" and the
    # trailing "]" so the description lists only the allowed values.
    cache_mode: Optional[CACHE_SIZES] = Field(
        "FP16",
        description=(
            "Enable different cache modes for VRAM savings (default: FP16).\n"
            f"Possible values: {str(CACHE_SIZES)[15:-1]}."
        ),
    )
    cache_size: Optional[int] = Field(
        None,
        description=(
            "Size of the prompt cache to allocate (default: max_seq_len).\n"
            "Must be a multiple of 256 and can't be less than max_seq_len.\n"
            "For CFG, set this to 2 * max_seq_len."
        ),
        multiple_of=256,
        gt=0,
    )
    chunk_size: Optional[int] = Field(
        2048,
        description=(
            "Chunk size for prompt ingestion (default: 2048).\n"
            "A lower value reduces VRAM usage but decreases ingestion speed.\n"
            "NOTE: Effects vary depending on the model.\n"
            "An ideal value is between 512 and 4096."
        ),
        gt=0,
    )
    max_batch_size: Optional[int] = Field(
        None,
        description=(
            "Set the maximum number of prompts to process at one time "
            "(default: None/Automatic).\n"
            "Automatically calculated if left blank.\n"
            "NOTE: Only available for Nvidia ampere (30 series) and above GPUs."
        ),
        ge=1,
    )
    prompt_template: Optional[str] = Field(
        None,
        description=(
            "Set the prompt template for this model. (default: None)\n"
            "If empty, attempts to look for the model's chat template.\n"
            "If a model contains multiple templates in its tokenizer_config.json,\n"
            "set prompt_template to the name of the template you want to use.\n"
            "NOTE: Only works with chat completion message lists!"
        ),
    )
    vision: Optional[bool] = Field(
        False,
        description=(
            "Enables vision support if the model supports it. (default: False)"
        ),
    )
    num_experts_per_token: Optional[int] = Field(
        None,
        description=(
            "Number of experts to use per token.\n"
            "Fetched from the model's config.json if empty.\n"
            "NOTE: For MoE models only.\n"
            "WARNING: Don't set this unless you know what you're doing!"
        ),
        ge=1,
    )

    _metadata: Metadata = PrivateAttr(Metadata())

    # Allow field names with the "model_" prefix (model_dir, model_name)
    # without triggering Pydantic's protected namespace warning.
    model_config = ConfigDict(protected_namespaces=())


class DraftModelConfig(BaseConfigModel):
    """
    Options for draft models (speculative decoding)
    This will use more VRAM!
    """

    # TODO: convert this to a pathlib.Path?
    draft_model_dir: Optional[str] = Field(
        "models",
        description=("Directory to look for draft models (default: models)"),
    )
    draft_model_name: Optional[str] = Field(
        None,
        description=(
            "An initial draft model to load.\n"
            "Ensure the model is in the model directory."
        ),
    )
    draft_rope_scale: Optional[float] = Field(
        1.0,
        description=(
            "Rope scale for draft models (default: 1.0).\n"
            "Same as compress_pos_emb.\n"
            "Use if the draft model was trained on long context with rope."
        ),
    )
    draft_rope_alpha: Optional[Union[float, Literal["auto"]]] = Field(
        None,
        description=(
            "Rope alpha for draft models (default: None).\n"
            'Same as alpha_value. Set to "auto" to auto-calculate.\n'
            "Leaving this value blank will either pull from the model "
            "or auto-calculate."
        ),
    )
    draft_cache_mode: Optional[CACHE_SIZES] = Field(
        "FP16",
        description=(
            "Cache mode for draft models to save VRAM (default: FP16).\n"
            f"Possible values: {str(CACHE_SIZES)[15:-1]}."
        ),
    )


class LoraInstanceModel(BaseConfigModel):
    """Model representing an instance of a Lora."""

    name: Optional[str] = None
    scaling: float = Field(1.0, ge=0)


class LoraConfig(BaseConfigModel):
    """Options for Loras"""

    # TODO: convert this to a pathlib.Path?
    lora_dir: Optional[str] = Field(
        "loras", description=("Directory to look for LoRAs (default: loras).")
    )
    loras: Optional[List[LoraInstanceModel]] = Field(
        None,
        description=(
            "List of LoRAs to load and associated scaling factors "
            "(default scale: 1.0).\n"
            "For the YAML file, add each entry as a YAML list:\n"
            "- name: lora1\n"
            "  scaling: 1.0"
        ),
    )


class EmbeddingsConfig(BaseConfigModel):
    """
    Options for embedding models and loading.
    NOTE: Embeddings requires the "extras" feature to be installed
    Install it via "pip install .[extras]"
    """

    # TODO: convert this to a pathlib.Path?
    embedding_model_dir: Optional[str] = Field(
        "models",
        description=("Directory to look for embedding models (default: models)."),
    )
    embeddings_device: Optional[Literal["cpu", "auto", "cuda"]] = Field(
        "cpu",
        description=(
            "Device to load embedding models on (default: cpu).\n"
            "Possible values: cpu, auto, cuda.\n"
            "NOTE: It's recommended to load embedding models on the CPU.\n"
            "If using an AMD GPU, set this value to 'cuda'."
        ),
    )
    embedding_model_name: Optional[str] = Field(
        None,
        description=("An initial embedding model to load on the infinity backend."),
    )


class SamplingConfig(BaseConfigModel):
    """Options for Sampling"""

    override_preset: Optional[str] = Field(
        None,
        description=(
            "Select a sampler override preset (default: None).\n"
            "Find this in the sampler-overrides folder.\n"
            "This overrides default fallbacks for sampler values "
            "that are passed to the API."
        ),
    )


class DeveloperConfig(BaseConfigModel):
    """Options for development and experimentation"""

    unsafe_launch: Optional[bool] = Field(
        False,
        description=(
            "Skip Exllamav2 version check (default: False).\n"
            "WARNING: It's highly recommended to update your dependencies rather "
            "than enabling this flag."
        ),
    )
    disable_request_streaming: Optional[bool] = Field(
        False, description=("Disable API request streaming (default: False).")
    )
    cuda_malloc_backend: Optional[bool] = Field(
        False, description=("Enable the torch CUDA malloc backend (default: False).")
    )
    uvloop: Optional[bool] = Field(
        False,
        description=(
            "Run asyncio using Uvloop or Winloop which can improve performance.\n"
            "NOTE: It's recommended to enable this, but if something breaks "
            "turn this off."
        ),
    )
    realtime_process_priority: Optional[bool] = Field(
        False,
        description=(
            "Set process to use a higher priority.\n"
            "For realtime process priority, run as administrator or sudo.\n"
            "Otherwise, the priority will be set to high."
        ),
    )


class TabbyConfigModel(BaseModel):
    """Base model for a TabbyConfig."""

    config: Optional[ConfigOverrideConfig] = Field(
        default_factory=ConfigOverrideConfig.model_construct
    )
    network: Optional[NetworkConfig] = Field(
        default_factory=NetworkConfig.model_construct
    )
    logging: Optional[LoggingConfig] = Field(
        default_factory=LoggingConfig.model_construct
    )
    model: Optional[ModelConfig] = Field(default_factory=ModelConfig.model_construct)
    draft_model: Optional[DraftModelConfig] = Field(
        default_factory=DraftModelConfig.model_construct
    )
    lora: Optional[LoraConfig] = Field(default_factory=LoraConfig.model_construct)
    embeddings: Optional[EmbeddingsConfig] = Field(
        default_factory=EmbeddingsConfig.model_construct
    )
    sampling: Optional[SamplingConfig] = Field(
        default_factory=SamplingConfig.model_construct
    )
    developer: Optional[DeveloperConfig] = Field(
        default_factory=DeveloperConfig.model_construct
    )
    actions: Optional[UtilityActions] = Field(
        default_factory=UtilityActions.model_construct
    )

    model_config = ConfigDict(validate_assignment=True, protected_namespaces=())
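
# A minimal usage sketch (illustration only, not part of this module): loading a
# user config.yml into the full config tree. Assumes PyYAML is installed; the
# project's actual loader may differ.
#
#   import yaml
#
#   with open("config.yml", encoding="utf-8") as f:
#       raw = yaml.safe_load(f) or {}
#   config = TabbyConfigModel.model_validate(raw)
#   print(config.network.host, config.network.port)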