Config + Model: Allow for default fallbacks from config for model loads

Previously, the parameters under the "model" block in config.yml only
handled the loading of a model on startup. This meant that any subsequent
API request had to fill out each parameter itself or fall back to a sane
default (usually the model's config.json).

However, there are cases where admins may want an argument from the
config to apply when the parameter isn't provided in the request body.
To help alleviate this, add a mechanism that works like sampler overrides,
where admins can flag config arguments to act as fallbacks.

Therefore, this change both preserves the source of truth for the
parameters the admin is loading and adds some convenience for users
who want customizable defaults for their requests.

This behavior may change in the future, but I think it solves the
issue for now.

Signed-off-by: kingbri <bdashore3@proton.me>
kingbri 2024-07-06 17:29:59 -04:00
parent d03752e31b
commit 27d2d5f3d2
3 changed files with 84 additions and 20 deletions


@@ -9,7 +9,9 @@ from loguru import logger
from typing import Optional
from backends.exllamav2.model import ExllamaV2Container
from common import config
from common.logger import get_loading_progress_bar
from common.utils import unwrap
# Global model container
container: Optional[ExllamaV2Container] = None
@@ -91,3 +93,19 @@ async def load_loras(lora_dir, **kwargs):
async def unload_loras():
    """Wrapper to unload loras"""
    await container.unload(loras_only=True)


def get_config_default(key, fallback=None, is_draft=False):
    """Fetches a default value from model config if allowed by the user."""
    model_config = config.model_config()
    default_keys = unwrap(model_config.get("use_as_default"), [])

    if key in default_keys:
        # Is this a draft model load parameter?
        if is_draft:
            draft_config = config.draft_model_config()
            return unwrap(draft_config.get(key), fallback)
        else:
            return unwrap(model_config.get(key), fallback)
    else:
        return fallback
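
To illustrate the resolution order, here is a minimal, self-contained sketch of the same logic. The config dictionaries and their values are hypothetical stand-ins for common.config and a parsed config.yml, and unwrap is reimplemented locally so the snippet runs on its own:

# Hypothetical stand-ins for the parsed "model" and "draft" config blocks
model_config = {"cache_mode": "Q4", "use_as_default": ["cache_mode", "draft_rope_scale"]}
draft_model_config = {"draft_rope_scale": 1.5}

def unwrap(value, default=None):
    """Local copy of common.utils.unwrap: return value unless it is None."""
    return value if value is not None else default

def get_config_default(key, fallback=None, is_draft=False):
    """Same resolution order as the helper above, wired to the stand-ins."""
    default_keys = unwrap(model_config.get("use_as_default"), [])
    if key in default_keys:
        # Draft keys are listed under the model block, but their values
        # are read from the draft block
        source = draft_model_config if is_draft else model_config
        return unwrap(source.get(key), fallback)
    return fallback

# Listed in use_as_default -> the config value wins over the caller's fallback
assert get_config_default("cache_mode", "FP16") == "Q4"
assert get_config_default("draft_rope_scale", 1.0, is_draft=True) == 1.5
# Not listed -> the caller's fallback is used
assert get_config_default("chunk_size", 2048) == 2048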


@@ -51,19 +51,29 @@ developer:
  #cuda_malloc_backend: False
# Options for model overrides and loading
# Please read the comments to understand how arguments are handled between initial and API loads
model:
  # Overrides the directory to look for models (default: models)
  # Windows users, DO NOT put this path in quotes! This directory will be invalid otherwise.
  model_dir: models

  # Sends dummy model names when the models endpoint is queried
  # Enable this if the program is looking for a specific OAI model
  #use_dummy_models: False

  # An initial model to load. Make sure the model is located in the model directory!
  # A model can be loaded later via the API.
  # REQUIRED: This must be filled out to load a model on startup!
  model_name:
  # The below parameters only apply for initial loads
  # All API based loads do NOT inherit these settings unless specified in use_as_default

  # Names of args to use as a default fallback for API load requests (default: [])
  # For example, if you always want cache_mode to be Q4 instead of only setting it
  # on the initial model load, add "cache_mode" to this array
  # Ex. ["max_seq_len", "cache_mode"]
  #use_as_default: []

  # The below parameters apply only if model_name is set
@@ -143,6 +153,9 @@ model:
    # A draft model can be loaded later via the API.
    #draft_model_name: A model name

    # The below parameters only apply for initial loads
    # All API based loads do NOT inherit these settings unless specified in use_as_default

    # Rope scale for draft models (default: 1.0)
    # Same thing as compress_pos_emb
    # Only use if your draft model was trained on long context with rope (check config.json)


@@ -5,6 +5,7 @@ from time import time
from typing import List, Optional
from common.gen_logging import GenLogPreferences
from common.model import get_config_default
class ModelCardParameters(BaseModel):
@@ -46,60 +47,92 @@ class ModelList(BaseModel):
class DraftModelLoadRequest(BaseModel):
    """Represents a draft model load request."""

    # Required
    draft_model_name: str

    # Config arguments
    draft_rope_scale: Optional[float] = Field(
        default_factory=lambda: get_config_default(
            "draft_rope_scale", 1.0, is_draft=True
        )
    )
    draft_rope_alpha: Optional[float] = Field(
        description="Automatically calculated if not present",
        default_factory=lambda: get_config_default(
            "draft_rope_alpha", None, is_draft=True
        ),
        examples=[1.0],
    )
    draft_cache_mode: Optional[str] = Field(
        default_factory=lambda: get_config_default(
            "draft_cache_mode", "FP16", is_draft=True
        )
    )
class ModelLoadRequest(BaseModel):
    """Represents a model load request."""

    # Required
    name: str

    # Config arguments

    # Max seq len is fetched from config.json of the model by default
    max_seq_len: Optional[int] = Field(
        description="Leave this blank to use the model's base sequence length",
        default_factory=lambda: get_config_default("max_seq_len"),
        examples=[4096],
    )
    override_base_seq_len: Optional[int] = Field(
        description=(
            "Overrides the model's base sequence length. " "Leave blank if unsure"
        ),
        default_factory=lambda: get_config_default("override_base_seq_len"),
        examples=[4096],
    )
    cache_size: Optional[int] = Field(
        description=("Number in tokens, must be greater than or equal to max_seq_len"),
        default_factory=lambda: get_config_default("cache_size"),
        examples=[4096],
    )
    gpu_split_auto: Optional[bool] = Field(
        default_factory=lambda: get_config_default("gpu_split_auto", True)
    )
    autosplit_reserve: Optional[List[float]] = Field(
        default_factory=lambda: get_config_default("autosplit_reserve", [96])
    )
    gpu_split: Optional[List[float]] = Field(
        default_factory=lambda: get_config_default("gpu_split", []),
        examples=[[24.0, 20.0]],
    )
    rope_scale: Optional[float] = Field(
        description="Automatically pulled from the model's config if not present",
        default_factory=lambda: get_config_default("rope_scale"),
        examples=[1.0],
    )
    rope_alpha: Optional[float] = Field(
        description="Automatically calculated if not present",
        default_factory=lambda: get_config_default("rope_alpha"),
        examples=[1.0],
    )
    # low_mem: Optional[bool] = False
    cache_mode: Optional[str] = Field(
        default_factory=lambda: get_config_default("cache_mode", "FP16")
    )
    chunk_size: Optional[int] = Field(
        default_factory=lambda: get_config_default("chunk_size", 2048)
    )
    prompt_template: Optional[str] = Field(
        default_factory=lambda: get_config_default("prompt_template")
    )
    num_experts_per_token: Optional[int] = Field(
        default_factory=lambda: get_config_default("num_experts_per_token")
    )
    fasttensors: Optional[bool] = Field(
        default_factory=lambda: get_config_default("fasttensors", False)
    )

    # Non-config arguments
    draft: Optional[DraftModelLoadRequest] = None
    skip_queue: Optional[bool] = False
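
The switch from static defaults to default_factory matters because the factory runs each time a request model is instantiated, so an omitted field picks up the config fallback at request time while an explicitly supplied field always wins. Below is a trimmed-down sketch of that behavior; the field set, config values, and get_config_default stand-in are hypothetical, not the real schema:

from typing import Optional
from pydantic import BaseModel, Field

# Stand-in for common.model.get_config_default with hypothetical config values
CONFIG = {"use_as_default": ["cache_mode"], "cache_mode": "Q4"}

def get_config_default(key, fallback=None):
    if key in CONFIG.get("use_as_default", []):
        value = CONFIG.get(key)
        return value if value is not None else fallback
    return fallback

class LoadRequest(BaseModel):
    """Trimmed-down stand-in for ModelLoadRequest."""

    name: str
    cache_mode: Optional[str] = Field(
        default_factory=lambda: get_config_default("cache_mode", "FP16")
    )
    chunk_size: Optional[int] = Field(
        default_factory=lambda: get_config_default("chunk_size", 2048)
    )

req = LoadRequest(name="some-model")
assert req.cache_mode == "Q4"    # listed in use_as_default -> config value applies
assert req.chunk_size == 2048    # not listed -> hard-coded fallback applies
assert LoadRequest(name="some-model", cache_mode="FP16").cache_mode == "FP16"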