Config + Model: Allow for default fallbacks from config for model loads
Previously, the parameters under the "model" block in config.yml were only used when loading a model on startup. Any subsequent API request had to fill out each parameter itself or fall back to a sane default (usually the model's config.json). However, there are cases where admins may want a config value to apply whenever a parameter isn't provided in the request body. To help with this, add a mechanism that works like sampler overrides: a flag marks a config value as a fallback for requests. This change preserves the config as the source of truth for what the admin loads while giving users customizable defaults for their requests. The behavior may change in the future, but I think it solves the issue for now.

Signed-off-by: kingbri <bdashore3@proton.me>
parent d03752e31b
commit 27d2d5f3d2
3 changed files with 84 additions and 20 deletions
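The diff wires each Pydantic field's default_factory to a new get_config_default helper from common.model. The helper's implementation isn't part of this diff; as a rough sketch inferred only from its call sites (get_config_default(key, fallback, is_draft=...)), it presumably does something like the following, where the opt-in flag name ("use_as_default") and the config layout are purely illustrative assumptions:

# Hypothetical sketch of common.model.get_config_default, inferred from the
# call sites in the diff below. The flag name and config layout are
# illustrative assumptions, not the actual implementation.
from typing import Any

# Stand-ins for the "model" / "draft" blocks parsed from config.yml at startup
_model_config: dict = {}
_draft_config: dict = {}


def get_config_default(key: str, fallback: Any = None, is_draft: bool = False) -> Any:
    """Return the config value for `key` if the admin flagged it as a
    request-level fallback; otherwise return the hardcoded fallback."""
    config = _draft_config if is_draft else _model_config

    # Only fall back to the config when the admin opted in, similar to how
    # sampler overrides are enabled (assumed "use_as_default" style flag)
    if config.get("use_as_default") and key in config:
        return config[key]

    return fallback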
@@ -5,6 +5,7 @@ from time import time
 from typing import List, Optional
 
 from common.gen_logging import GenLogPreferences
+from common.model import get_config_default
 
 
 class ModelCardParameters(BaseModel):
@@ -46,60 +47,92 @@ class ModelList(BaseModel):
 class DraftModelLoadRequest(BaseModel):
     """Represents a draft model load request."""
 
     # Required
     draft_model_name: str
-    draft_rope_scale: Optional[float] = 1.0
+
+    # Config arguments
+    draft_rope_scale: Optional[float] = Field(
+        default_factory=lambda: get_config_default(
+            "draft_rope_scale", 1.0, is_draft=True
+        )
+    )
     draft_rope_alpha: Optional[float] = Field(
         description="Automatically calculated if not present",
-        default=None,
+        default_factory=lambda: get_config_default(
+            "draft_rope_alpha", None, is_draft=True
+        ),
         examples=[1.0],
     )
-    draft_cache_mode: Optional[str] = "FP16"
+    draft_cache_mode: Optional[str] = Field(
+        default_factory=lambda: get_config_default(
+            "draft_cache_mode", "FP16", is_draft=True
+        )
+    )
 
 
 class ModelLoadRequest(BaseModel):
     """Represents a model load request."""
 
     # Required
     name: str
 
+    # Config arguments
+
     # Max seq len is fetched from config.json of the model by default
     max_seq_len: Optional[int] = Field(
         description="Leave this blank to use the model's base sequence length",
-        default=None,
+        default_factory=lambda: get_config_default("max_seq_len"),
         examples=[4096],
     )
     override_base_seq_len: Optional[int] = Field(
         description=(
             "Overrides the model's base sequence length. " "Leave blank if unsure"
         ),
-        default=None,
+        default_factory=lambda: get_config_default("override_base_seq_len"),
         examples=[4096],
     )
     cache_size: Optional[int] = Field(
         description=("Number in tokens, must be greater than or equal to max_seq_len"),
-        default=None,
+        default_factory=lambda: get_config_default("cache_size"),
         examples=[4096],
     )
-    gpu_split_auto: Optional[bool] = True
-    autosplit_reserve: Optional[List[float]] = [96]
+    gpu_split_auto: Optional[bool] = Field(
+        default_factory=lambda: get_config_default("gpu_split_auto", True)
+    )
+    autosplit_reserve: Optional[List[float]] = Field(
+        default_factory=lambda: get_config_default("autosplit_reserve", [96])
+    )
     gpu_split: Optional[List[float]] = Field(
-        default_factory=list, examples=[[24.0, 20.0]]
+        default_factory=lambda: get_config_default("gpu_split", []),
+        examples=[[24.0, 20.0]],
     )
     rope_scale: Optional[float] = Field(
         description="Automatically pulled from the model's config if not present",
-        default=None,
+        default_factory=lambda: get_config_default("rope_scale"),
         examples=[1.0],
     )
     rope_alpha: Optional[float] = Field(
         description="Automatically calculated if not present",
-        default=None,
+        default_factory=lambda: get_config_default("rope_alpha"),
         examples=[1.0],
     )
     # low_mem: Optional[bool] = False
-    cache_mode: Optional[str] = "FP16"
-    chunk_size: Optional[int] = 2048
-    prompt_template: Optional[str] = None
-    num_experts_per_token: Optional[int] = None
-    fasttensors: Optional[bool] = False
+    cache_mode: Optional[str] = Field(
+        default_factory=lambda: get_config_default("cache_mode", "FP16")
+    )
+    chunk_size: Optional[int] = Field(
+        default_factory=lambda: get_config_default("chunk_size", 2048)
+    )
+    prompt_template: Optional[str] = Field(
+        default_factory=lambda: get_config_default("prompt_template")
+    )
+    num_experts_per_token: Optional[int] = Field(
+        default_factory=lambda: get_config_default("num_experts_per_token")
+    )
+    fasttensors: Optional[bool] = Field(
+        default_factory=lambda: get_config_default("fasttensors", False)
+    )
 
+    # Non-config arguments
     draft: Optional[DraftModelLoadRequest] = None
     skip_queue: Optional[bool] = False
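For reference, here is a self-contained illustration of what the Field(default_factory=...) pattern does for request handling. ModelLoadRequest itself isn't imported because its module path isn't shown in the diff; the config values and the reduced field set are hypothetical:

# Self-contained illustration of the Field(default_factory=...) pattern used
# above. The config values and the reduced field set are hypothetical; the
# real ModelLoadRequest lives elsewhere in the repo.
from typing import Optional
from pydantic import BaseModel, Field

# Stand-in for the "model" block parsed from config.yml (illustrative values)
CONFIG_MODEL_BLOCK = {"max_seq_len": 8192, "cache_mode": "Q4"}


def get_config_default(key, fallback=None):
    """Simplified stand-in for common.model.get_config_default."""
    return CONFIG_MODEL_BLOCK.get(key, fallback)


class LoadRequest(BaseModel):
    name: str
    # default_factory runs only when the field is absent from the request
    # body, so the config value becomes the default instead of a constant
    max_seq_len: Optional[int] = Field(
        default_factory=lambda: get_config_default("max_seq_len")
    )
    cache_mode: Optional[str] = Field(
        default_factory=lambda: get_config_default("cache_mode", "FP16")
    )


# Fields omitted from the request body pick up the config values
print(LoadRequest(name="my-model").max_seq_len)  # 8192
print(LoadRequest(name="my-model").cache_mode)   # Q4

# An explicit value in the request body still wins over the config fallback
print(LoadRequest(name="my-model", max_seq_len=4096).max_seq_len)  # 4096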