Model: Default max_seq_len to 4096

A common problem in TabbyAPI is that first-time users trying to get up and
running with a model hit OOM errors caused by max_seq_len. This is because
model devs often set max context values in the millions, which requires a
large amount of VRAM.

To idiot-proof first-time setup, make the fallback default 4096 so users
can run their models out of the box. Users who still want the model's full
max_seq_len can set the value to -1.
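For clarity, the resolution rule described above boils down to the following
sketch (resolve_max_seq_len is a hypothetical helper used only for
illustration, not a function in the codebase):

    # Sketch of the new fallback behavior (hypothetical helper, not TabbyAPI code)
    def resolve_max_seq_len(user_value, model_max_position_embeddings):
        """Resolve the sequence length a model is loaded with."""
        if user_value is None:
            # No value given: use the safe default instead of the model's max
            return 4096
        if user_value == -1:
            # Explicit opt-in to the model's full context from config.json
            return model_max_position_embeddings
        # Any other value is taken as-is
        return user_value

    # resolve_max_seq_len(None, 1_000_000)  -> 4096
    # resolve_max_seq_len(-1, 1_000_000)    -> 1_000_000
    # resolve_max_seq_len(16384, 1_000_000) -> 16384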

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
kingbri 2025-06-13 14:12:03 -04:00
parent 322f9b773a
commit 2096c9bad2
5 changed files with 20 additions and 12 deletions

View file

@@ -235,11 +235,10 @@ class ExllamaV2Container(BaseModelContainer):
         # Grab the base model's sequence length before overrides for
         # rope calculations
-        base_seq_len = self.config.max_seq_len
+        base_seq_len = hf_model.hf_config.max_position_embeddings

         # Set the target seq len if present
-        # Fallback to base_seq_len if not provided
-        target_seq_len = unwrap(kwargs.get("max_seq_len"), base_seq_len)
+        target_seq_len = unwrap(kwargs.get("max_seq_len"), 4096)

         # Set the rope scale
         self.config.scale_pos_emb = unwrap(
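The new fallback relies on TabbyAPI's unwrap helper; as used in this hunk it
presumably just substitutes the default when the value is None, roughly:

    # Assumed semantics of unwrap() as used above
    def unwrap(value, default=None):
        return value if value is not None else default

    target_seq_len = unwrap(None, 4096)    # -> 4096 when no max_seq_len was passed
    target_seq_len = unwrap(32768, 4096)   # -> 32768 when the user set one

Since load_model_gen (later in this commit) now always fills in
kwargs["max_seq_len"], the 4096 here appears to act as a second safety net for
callers that construct the container directly.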
@@ -247,6 +246,7 @@ class ExllamaV2Container(BaseModelContainer):
         )

         # Sets rope alpha value.
+        # Utilize the model's max_position_embeddings as a base value
         # Automatically calculate if unset or defined as an "auto" literal.
         rope_alpha = unwrap(kwargs.get("rope_alpha"), "auto")
         if rope_alpha == "auto":
@@ -371,7 +371,7 @@ class ExllamaV2Container(BaseModelContainer):
         )

         # Set draft rope alpha. Follows same behavior as model rope alpha.
-        # Use the base sequence length of the model
+        # Use the max_position_embeddings of the model
         draft_rope_alpha = unwrap(draft_args.get("draft_rope_alpha"), "auto")
         if draft_rope_alpha == "auto":
             self.draft_config.scale_alpha_value = calculate_rope_alpha(
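The body of calculate_rope_alpha isn't shown in this diff; the usual approach
(and only an assumption here) is an NTK-aware scaling factor derived from the
ratio of the requested context to the base context, which is why
max_position_embeddings is the natural base value:

    # Assumed shape of calculate_rope_alpha: a common NTK-alpha curve fit.
    # The exact formula in TabbyAPI may differ; this is illustrative only.
    def calculate_rope_alpha(base_seq_len: int, target_seq_len: int) -> float:
        ratio = target_seq_len / base_seq_len
        if ratio <= 1.0:
            return 1.0  # no context extension needed
        # Quadratic fit widely used for NTK-aware rope alpha
        return -0.13436 + 0.80541 * ratio + 0.28833 * ratio**2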
@@ -399,6 +399,8 @@ class ExllamaV2Container(BaseModelContainer):
             self.draft_config.max_input_len = chunk_size
             self.draft_config.max_attention_size = chunk_size**2

+        print(self.config.max_seq_len)
+
         # Return the created instance
         return self

View file

@ -175,10 +175,10 @@ class ModelConfig(BaseConfigModel):
max_seq_len: Optional[int] = Field( max_seq_len: Optional[int] = Field(
None, None,
description=( description=(
"Max sequence length (default: Empty).\n" "Max sequence length (default: 4096).\n"
"Fetched from the model's base sequence length in config.json by default." "Set to -1 to fetch from the model's config.json"
), ),
ge=0, ge=-1,
) )
tensor_parallel: Optional[bool] = Field( tensor_parallel: Optional[bool] = Field(
False, False,
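Loosening the bound from ge=0 to ge=-1 is what permits the sentinel while still
rejecting anything below it at validation time. A minimal sketch with a
standalone stand-in model (not TabbyAPI's ModelConfig), assuming Pydantic v2:

    from typing import Optional
    from pydantic import BaseModel, Field, ValidationError

    class ModelConfigSketch(BaseModel):
        # Mirrors the constraint above: None, -1, or any non-negative int
        max_seq_len: Optional[int] = Field(None, ge=-1)

    ModelConfigSketch(max_seq_len=None)   # ok: resolved to 4096 downstream
    ModelConfigSketch(max_seq_len=-1)     # ok: "use the model's native context"
    ModelConfigSketch(max_seq_len=4096)   # ok

    try:
        ModelConfigSketch(max_seq_len=-2)
    except ValidationError:
        print("values below -1 are rejected")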

View file

@@ -95,7 +95,7 @@ async def apply_inline_overrides(model_dir: pathlib.Path, **kwargs):
     else:
         logger.warning(
             "Cannot find inline model overrides. "
-            "Make sure they are nested under a \"model:\" key"
+            'Make sure they are nested under a "model:" key'
         )

     # Merge draft overrides beforehand
@@ -148,6 +148,13 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
     # Fetch the extra HF configuration options
     hf_model = await HFModel.from_directory(model_path)

+    # Override the max sequence length based on user
+    max_seq_len = kwargs.get("max_seq_len")
+    if max_seq_len == -1:
+        kwargs["max_seq_len"] = hf_model.hf_config.max_position_embeddings
+    elif max_seq_len is None:
+        kwargs["max_seq_len"] = 4096
+
     # Create a new container and check if the right dependencies are installed
     backend = unwrap(kwargs.get("backend"), detect_backend(hf_model))
     container_class = _BACKEND_REGISTRY.get(backend)

View file

@@ -39,12 +39,11 @@ class GenerationConfig(BaseModel):
 class HuggingFaceConfig(BaseModel):
     """
-    DEPRECATED: Currently a stub and doesn't do anything.
-
     An abridged version of HuggingFace's model config.
     Will be expanded as needed.
     """

+    max_position_embeddings: int = 4096
     eos_token_id: Optional[Union[int, List[int]]] = None
     quantization_config: Optional[Dict] = None
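With max_position_embeddings on the abridged config, reading it presumably
amounts to validating the model's config.json and discarding everything else
(Pydantic v2 ignores unknown keys by default). A rough sketch; the loader
function and file handling are assumptions, while the fields are copied from
the hunk above:

    import json
    import pathlib
    from typing import Dict, List, Optional, Union
    from pydantic import BaseModel

    class HuggingFaceConfig(BaseModel):
        """Abridged copy of the fields shown in this hunk."""
        max_position_embeddings: int = 4096
        eos_token_id: Optional[Union[int, List[int]]] = None
        quantization_config: Optional[Dict] = None

    def load_hf_config(model_dir: pathlib.Path) -> HuggingFaceConfig:
        # Hypothetical loader: unknown keys in config.json are ignored, and
        # a missing max_position_embeddings falls back to the 4096 default.
        raw = json.loads((model_dir / "config.json").read_text())
        return HuggingFaceConfig.model_validate(raw)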

View file

@@ -78,8 +78,8 @@ model:
   # Options: exllamav2, exllamav3
   backend:

-  # Max sequence length (default: Empty).
-  # Fetched from the model's base sequence length in config.json by default.
+  # Max sequence length (default: 4096).
+  # Set to -1 to fetch from the model's config.json
   max_seq_len:

   # Load model with tensor parallelism.