diff --git a/common/model.py b/common/model.py
index 105daa5..14d3cc0 100644
--- a/common/model.py
+++ b/common/model.py
@@ -9,7 +9,9 @@ from loguru import logger
 from typing import Optional
 
 from backends.exllamav2.model import ExllamaV2Container
+from common import config
 from common.logger import get_loading_progress_bar
+from common.utils import unwrap
 
 # Global model container
 container: Optional[ExllamaV2Container] = None
@@ -91,3 +93,19 @@ async def load_loras(lora_dir, **kwargs):
 async def unload_loras():
     """Wrapper to unload loras"""
     await container.unload(loras_only=True)
+
+
+def get_config_default(key, fallback=None, is_draft=False):
+    """Fetches a default value from model config if allowed by the user."""
+
+    model_config = config.model_config()
+    default_keys = unwrap(model_config.get("use_as_default"), [])
+    if key in default_keys:
+        # Is this a draft model load parameter?
+        if is_draft:
+            draft_config = config.draft_model_config()
+            return unwrap(draft_config.get(key), fallback)
+        else:
+            return unwrap(model_config.get(key), fallback)
+    else:
+        return fallback
diff --git a/config_sample.yml b/config_sample.yml
index 1c0304d..0e4b180 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -51,19 +51,29 @@ developer:
   #cuda_malloc_backend: False
 
 # Options for model overrides and loading
+# Please read the comments to understand how arguments are handled between initial and API loads
 model:
   # Overrides the directory to look for models (default: models)
   # Windows users, DO NOT put this path in quotes! This directory will be invalid otherwise.
   model_dir: models
 
+  # Sends dummy model names when the models endpoint is queried
+  # Enable this if the program is looking for a specific OAI model
+  #use_dummy_models: False
+
   # An initial model to load. Make sure the model is located in the model directory!
   # A model can be loaded later via the API.
   # REQUIRED: This must be filled out to load a model on startup!
   model_name:
 
-  # Sends dummy model names when the models endpoint is queried
-  # Enable this if the program is looking for a specific OAI model
-  #use_dummy_models: False
+  # The below parameters only apply for initial loads
+  # All API based loads do NOT inherit these settings unless specified in use_as_default
+
+  # Names of args to use as a default fallback for API load requests (default: [])
+  # For example, if you always want cache_mode to be Q4, not just on the initial model load,
+  # add "cache_mode" to this array
+  # Ex. ["max_seq_len", "cache_mode"]
+  #use_as_default: []
 
   # The below parameters apply only if model_name is set
 
@@ -143,6 +153,9 @@ model:
     # A draft model can be loaded later via the API.
     #draft_model_name: A model name
 
+    # The below parameters only apply for initial loads
+    # All API based loads do NOT inherit these settings unless specified in use_as_default
+
     # Rope scale for draft models (default: 1.0)
     # Same thing as compress_pos_emb
     # Only use if your draft model was trained on long context with rope (check config.json)
diff --git a/endpoints/OAI/types/model.py b/endpoints/OAI/types/model.py
index c549b49..30730b8 100644
--- a/endpoints/OAI/types/model.py
+++ b/endpoints/OAI/types/model.py
@@ -5,6 +5,7 @@ from time import time
 from typing import List, Optional
 
 from common.gen_logging import GenLogPreferences
+from common.model import get_config_default
 
 
 class ModelCardParameters(BaseModel):
@@ -46,60 +47,92 @@ class ModelList(BaseModel):
 class DraftModelLoadRequest(BaseModel):
     """Represents a draft model load request."""
 
+    # Required
     draft_model_name: str
-    draft_rope_scale: Optional[float] = 1.0
+
+    # Config arguments
+    draft_rope_scale: Optional[float] = Field(
+        default_factory=lambda: get_config_default(
+            "draft_rope_scale", 1.0, is_draft=True
+        )
+    )
     draft_rope_alpha: Optional[float] = Field(
         description="Automatically calculated if not present",
-        default=None,
+        default_factory=lambda: get_config_default(
+            "draft_rope_alpha", None, is_draft=True
+        ),
         examples=[1.0],
     )
-    draft_cache_mode: Optional[str] = "FP16"
+    draft_cache_mode: Optional[str] = Field(
+        default_factory=lambda: get_config_default(
+            "draft_cache_mode", "FP16", is_draft=True
+        )
+    )
 
 
 class ModelLoadRequest(BaseModel):
     """Represents a model load request."""
 
+    # Required
     name: str
 
+    # Config arguments
+    # Max seq len is fetched from config.json of the model by default
     max_seq_len: Optional[int] = Field(
         description="Leave this blank to use the model's base sequence length",
-        default=None,
+        default_factory=lambda: get_config_default("max_seq_len"),
         examples=[4096],
     )
     override_base_seq_len: Optional[int] = Field(
         description=(
             "Overrides the model's base sequence length. "
" "Leave blank if unsure" ), - default=None, + default_factory=lambda: get_config_default("override_base_seq_len"), examples=[4096], ) cache_size: Optional[int] = Field( description=("Number in tokens, must be greater than or equal to max_seq_len"), - default=None, + default_factory=lambda: get_config_default("cache_size"), examples=[4096], ) - gpu_split_auto: Optional[bool] = True - autosplit_reserve: Optional[List[float]] = [96] + gpu_split_auto: Optional[bool] = Field( + default_factory=lambda: get_config_default("gpu_split_auto", True) + ) + autosplit_reserve: Optional[List[float]] = Field( + default_factory=lambda: get_config_default("autosplit_reserve", [96]) + ) gpu_split: Optional[List[float]] = Field( - default_factory=list, examples=[[24.0, 20.0]] + default_factory=lambda: get_config_default("gpu_split", []), + examples=[[24.0, 20.0]], ) rope_scale: Optional[float] = Field( description="Automatically pulled from the model's config if not present", - default=None, + default_factory=lambda: get_config_default("rope_scale"), examples=[1.0], ) rope_alpha: Optional[float] = Field( description="Automatically calculated if not present", - default=None, + default_factory=lambda: get_config_default("rope_alpha"), examples=[1.0], ) - # low_mem: Optional[bool] = False - cache_mode: Optional[str] = "FP16" - chunk_size: Optional[int] = 2048 - prompt_template: Optional[str] = None - num_experts_per_token: Optional[int] = None - fasttensors: Optional[bool] = False + cache_mode: Optional[str] = Field( + default_factory=lambda: get_config_default("cache_mode", "FP16") + ) + chunk_size: Optional[int] = Field( + default_factory=lambda: get_config_default("chunk_size", 2048) + ) + prompt_template: Optional[str] = Field( + default_factory=lambda: get_config_default("prompt_template") + ) + num_experts_per_token: Optional[int] = Field( + default_factory=lambda: get_config_default("num_experts_per_token") + ) + fasttensors: Optional[bool] = Field( + default_factory=lambda: get_config_default("fasttensors", False) + ) + + # Non-config arguments draft: Optional[DraftModelLoadRequest] = None skip_queue: Optional[bool] = False