From b9e5693c1b53dce3043654cf20fff807475747eb Mon Sep 17 00:00:00 2001
From: kingbri
Date: Tue, 10 Sep 2024 23:35:35 -0400
Subject: [PATCH] API + Model: Apply config.yml defaults for all load paths

There are two ways to load a model:

1. Via the load endpoint
2. Inline with a completion request

The defaults were not being applied on the inline load path, so rewrite
the load flow to fix that. While doing this, build a defaults dictionary
once at config load time instead of comparing keys at runtime, and
remove the pydantic default_factory lambdas from all the model load
fields.

This makes the code cleaner and establishes a clear config tree for
loading models.

Signed-off-by: kingbri
---
 common/model.py               | 24 ++----------
 common/tabby_config.py        | 14 +++++++
 endpoints/core/types/model.py | 70 ++++++++++++-----------------------
 3 files changed, 41 insertions(+), 67 deletions(-)

diff --git a/common/model.py b/common/model.py
index a1f29b5..5fdfc5b 100644
--- a/common/model.py
+++ b/common/model.py
@@ -13,7 +13,6 @@ from typing import Optional
 from common.logger import get_loading_progress_bar
 from common.networking import handle_request_error
 from common.tabby_config import config
-from common.utils import unwrap
 from endpoints.utils import do_export_openapi
 
 if not do_export_openapi:
@@ -67,6 +66,10 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
             logger.info("Unloading existing model.")
             await unload_model()
 
+    # Merge with config defaults
+    kwargs = {**config.model_defaults, **kwargs}
+
+    # Create a new container
     container = await ExllamaV2Container.create(model_path.resolve(), False, **kwargs)
 
     model_type = "draft" if container.draft_config else "model"
@@ -149,25 +152,6 @@ async def unload_embedding_model():
     embeddings_container = None
 
 
-# FIXME: Maybe make this a one-time function instead of a dynamic default
-def get_config_default(key: str, model_type: str = "model"):
-    """Fetches a default value from model config if allowed by the user."""
-
-    default_keys = unwrap(config.model.get("use_as_default"), [])
-
-    # Add extra keys to defaults
-    default_keys.append("embeddings_device")
-
-    if key in default_keys:
-        # Is this a draft model load parameter?
-        if model_type == "draft":
-            return config.draft_model.get(key)
-        elif model_type == "embedding":
-            return config.embeddings.get(key)
-        else:
-            return config.model.get(key)
-
-
 async def check_model_container():
     """FastAPI depends that checks if a model isn't loaded or currently loading."""
 
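The merge added to load_model_gen relies on plain dict-unpacking precedence: keys from config.model_defaults seed the load arguments, and anything the caller passed explicitly wins. A minimal sketch of that behavior, using hypothetical keys and values rather than ones from a real config.yml:

    # Hypothetical values, only to show precedence in {**defaults, **kwargs}
    model_defaults = {"max_seq_len": 8192, "cache_mode": "Q4"}  # built from use_as_default
    request_kwargs = {"cache_mode": "FP16"}  # explicitly set by the caller

    merged = {**model_defaults, **request_kwargs}
    print(merged)  # {'max_seq_len': 8192, 'cache_mode': 'FP16'}

Because the request value is unpacked last it overrides the config default, while untouched keys such as max_seq_len fall through from the config.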
diff --git a/common/tabby_config.py b/common/tabby_config.py
index 215f9d0..704e3ba 100644
--- a/common/tabby_config.py
+++ b/common/tabby_config.py
@@ -7,6 +7,9 @@ from common.utils import unwrap, merge_dicts
 
 
 class TabbyConfig:
+    """Common config class for TabbyAPI. Loaded into sub-dictionaries from YAML file."""
+
+    # Sub-blocks of yaml
     network: dict = {}
     logging: dict = {}
     model: dict = {}
@@ -16,6 +19,9 @@ class TabbyConfig:
     developer: dict = {}
     embeddings: dict = {}
 
+    # Persistent defaults
+    model_defaults: dict = {}
+
     def load(self, arguments: Optional[dict] = None):
         """Synchronously loads the global application config"""
 
@@ -36,6 +42,14 @@ class TabbyConfig:
         self.developer = unwrap(merged_config.get("developer"), {})
         self.embeddings = unwrap(merged_config.get("embeddings"), {})
 
+        # Set model defaults dict once to prevent on-demand reconstruction
+        default_keys = unwrap(self.model.get("use_as_default"), [])
+        for key in default_keys:
+            if key in self.model:
+                self.model_defaults[key] = config.model[key]
+            elif key in self.draft_model:
+                self.model_defaults[key] = config.draft_model[key]
+
     def _from_file(self, config_path: pathlib.Path):
         """loads config from a given file path"""
 
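model_defaults is now filled exactly once while the YAML is parsed: each key listed under use_as_default is looked up first in the model block and then in the draft_model block. A rough sketch of that flow with made-up config values (the dictionaries and the inline unwrap helper below are stand-ins, not the real loader):

    def unwrap(value, default):
        # Same idea as common.utils.unwrap: fall back only when the value is None
        return value if value is not None else default

    # Hypothetical parsed config.yml blocks
    model = {
        "max_seq_len": 8192,
        "cache_mode": "Q4",
        "use_as_default": ["max_seq_len", "cache_mode", "draft_rope_alpha"],
    }
    draft_model = {"draft_rope_alpha": 1.0}

    model_defaults = {}
    for key in unwrap(model.get("use_as_default"), []):
        if key in model:
            model_defaults[key] = model[key]
        elif key in draft_model:
            model_defaults[key] = draft_model[key]

    print(model_defaults)
    # {'max_seq_len': 8192, 'cache_mode': 'Q4', 'draft_rope_alpha': 1.0}

Keys listed in use_as_default but present in neither block are simply skipped, so the pydantic field default (None) applies for them.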
" "Leave blank if unsure" ), - default_factory=lambda: get_config_default("override_base_seq_len"), + default=None, examples=[4096], ) cache_size: Optional[int] = Field( description=("Number in tokens, must be greater than or equal to max_seq_len"), - default_factory=lambda: get_config_default("cache_size"), + default=None, examples=[4096], ) - tensor_parallel: Optional[bool] = Field( - default_factory=lambda: get_config_default("tensor_parallel") - ) - gpu_split_auto: Optional[bool] = Field( - default_factory=lambda: get_config_default("gpu_split_auto") - ) - autosplit_reserve: Optional[List[float]] = Field( - default_factory=lambda: get_config_default("autosplit_reserve") - ) + tensor_parallel: Optional[bool] = None + gpu_split_auto: Optional[bool] = None + autosplit_reserve: Optional[List[float]] = None gpu_split: Optional[List[float]] = Field( - default_factory=lambda: get_config_default("gpu_split"), + default=None, examples=[[24.0, 20.0]], ) rope_scale: Optional[float] = Field( description="Automatically pulled from the model's config if not present", - default_factory=lambda: get_config_default("rope_scale"), + default=None, examples=[1.0], ) rope_alpha: Optional[Union[float, Literal["auto"]]] = Field( description='Automatically calculated if set to "auto"', - default_factory=lambda: get_config_default("rope_alpha"), + default=None, examples=[1.0], ) - cache_mode: Optional[str] = Field( - default_factory=lambda: get_config_default("cache_mode") - ) - chunk_size: Optional[int] = Field( - default_factory=lambda: get_config_default("chunk_size") - ) - prompt_template: Optional[str] = Field( - default_factory=lambda: get_config_default("prompt_template") - ) - num_experts_per_token: Optional[int] = Field( - default_factory=lambda: get_config_default("num_experts_per_token") - ) - fasttensors: Optional[bool] = Field( - default_factory=lambda: get_config_default("fasttensors") - ) + cache_mode: Optional[str] = None + chunk_size: Optional[int] = None + prompt_template: Optional[str] = None + num_experts_per_token: Optional[int] = None + fasttensors: Optional[bool] = None # Non-config arguments draft: Optional[DraftModelLoadRequest] = None @@ -142,9 +116,11 @@ class ModelLoadRequest(BaseModel): class EmbeddingModelLoadRequest(BaseModel): name: str + + # Set default from the config embeddings_device: Optional[str] = Field( - default_factory=lambda: get_config_default( - "embeddings_device", model_type="embedding" + default_factory=lambda: unwrap( + config.embeddings.get("embeddings_device"), "cpu" ) )