Model: Default max_seq_len to 4096

A common problem in TabbyAPI is that users who want to get up and
running with a model often hit OOM errors caused by max_seq_len. This
is because model devs set max context values in the millions, and
allocating cache for that much context requires a lot of VRAM.

To idiot-proof first-time setup, make the fallback default 4096 so
users can run their models. Users who still want the model's native
max_seq_len can set it to -1.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
kingbri 2025-06-13 14:12:03 -04:00
parent 322f9b773a
commit 2096c9bad2
5 changed files with 20 additions and 12 deletions
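
The new behavior reduces to a simple resolution order. Below is a
standalone sketch for illustration only; resolve_max_seq_len is a
hypothetical name, and the actual logic lives in load_model_gen
further down:

    # Hypothetical helper mirroring the new max_seq_len resolution order.
    # hf_max_position_embeddings stands in for the value the loader reads
    # from the model's config.json.
    def resolve_max_seq_len(user_value, hf_max_position_embeddings):
        if user_value == -1:
            # Explicit opt-in to the model's full context window
            return hf_max_position_embeddings
        if user_value is None:
            # Safe fallback so first-time setups don't OOM
            return 4096
        # Any other value that passed validation is used as-is
        return user_value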


@@ -175,10 +175,10 @@ class ModelConfig(BaseConfigModel):
     max_seq_len: Optional[int] = Field(
         None,
         description=(
-            "Max sequence length (default: Empty).\n"
-            "Fetched from the model's base sequence length in config.json by default."
+            "Max sequence length (default: 4096).\n"
+            "Set to -1 to fetch from the model's config.json"
        ),
-        ge=0,
+        ge=-1,
     )
     tensor_parallel: Optional[bool] = Field(
         False,
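
To see what the relaxed lower bound permits, here is a minimal sketch
assuming only the field shape shown above (ModelConfigSketch is a
stand-in class, not TabbyAPI code):

    from typing import Optional
    from pydantic import BaseModel, Field, ValidationError

    class ModelConfigSketch(BaseModel):
        max_seq_len: Optional[int] = Field(None, ge=-1)

    ModelConfigSketch(max_seq_len=-1)      # accepted: the new sentinel
    ModelConfigSketch()                    # accepted: loader fills in 4096 later
    try:
        ModelConfigSketch(max_seq_len=-2)  # rejected: below the -1 floor
    except ValidationError as err:
        print(err)

Loosening ge=0 to ge=-1 is what lets the sentinel through schema
validation so the loader can interpret it.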


@@ -95,7 +95,7 @@ async def apply_inline_overrides(model_dir: pathlib.Path, **kwargs):
     else:
         logger.warning(
             "Cannot find inline model overrides. "
-            "Make sure they are nested under a \"model:\" key"
+            'Make sure they are nested under a "model:" key'
         )

     # Merge draft overrides beforehand
@@ -148,6 +148,13 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
     # Fetch the extra HF configuration options
     hf_model = await HFModel.from_directory(model_path)

+    # Override the max sequence length based on user
+    max_seq_len = kwargs.get("max_seq_len")
+    if max_seq_len == -1:
+        kwargs["max_seq_len"] = hf_model.hf_config.max_position_embeddings
+    elif max_seq_len is None:
+        kwargs["max_seq_len"] = 4096
+
     # Create a new container and check if the right dependencies are installed
     backend = unwrap(kwargs.get("backend"), detect_backend(hf_model))
     container_class = _BACKEND_REGISTRY.get(backend)
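
For context, the value consumed by the -1 branch is the standard
max_position_embeddings key from a Hugging Face config.json. A minimal
sketch of that lookup, assuming a plain JSON read instead of TabbyAPI's
HFModel wrapper:

    import json
    import pathlib

    def read_max_position_embeddings(model_dir: pathlib.Path) -> int:
        # config.json ships with every HF-format checkpoint; recent
        # long-context models set this key in the millions
        config = json.loads((model_dir / "config.json").read_text())
        return config["max_position_embeddings"]

That multi-million-token ceiling is exactly what made it unsafe as an
implicit default.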


@@ -39,12 +39,11 @@ class GenerationConfig(BaseModel):
 class HuggingFaceConfig(BaseModel):
     """
+    DEPRECATED: Currently a stub and doesn't do anything.
     An abridged version of HuggingFace's model config.
     Will be expanded as needed.
     """

-    max_position_embeddings: int = 4096
-
     eos_token_id: Optional[Union[int, List[int]]] = None
     quantization_config: Optional[Dict] = None