Model: Default max_seq_len to 4096
A common problem in TabbyAPI is that users trying to get up and running with a model hit OOMs caused by max_seq_len, because model devs set max context values in the millions, which require a lot of VRAM. To idiot-proof first-time setup, make the fallback default 4096 so users can run their models out of the box. If a user still wants the model's native max_seq_len, they can set it to -1.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
This commit is contained in:
parent 322f9b773a
commit 2096c9bad2
5 changed files with 20 additions and 12 deletions
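
For reference, the rule described in the commit message reduces to a small fallback: an unset max_seq_len now becomes 4096, -1 defers to the model's config.json, and any other value is used as-is. A minimal illustrative sketch (the helper name resolve_max_seq_len is hypothetical and not part of TabbyAPI; the real change is in the load_model_gen hunk below):

from typing import Optional


def resolve_max_seq_len(user_value: Optional[int], model_max: int) -> int:
    """Hypothetical helper mirroring the fallback rule in this commit."""
    if user_value == -1:
        # -1 means "trust the model's config.json"
        return model_max
    if user_value is None:
        # Unset: use the safe default instead of the model's (often huge) max
        return 4096
    # Any explicit value is used as-is
    return user_value


# resolve_max_seq_len(None, 1048576)  -> 4096
# resolve_max_seq_len(-1, 1048576)    -> 1048576
# resolve_max_seq_len(16384, 1048576) -> 16384
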
@@ -235,11 +235,10 @@ class ExllamaV2Container(BaseModelContainer):
 
         # Grab the base model's sequence length before overrides for
         # rope calculations
-        base_seq_len = self.config.max_seq_len
+        base_seq_len = hf_model.hf_config.max_position_embeddings
 
         # Set the target seq len if present
-        # Fallback to base_seq_len if not provided
-        target_seq_len = unwrap(kwargs.get("max_seq_len"), base_seq_len)
+        target_seq_len = unwrap(kwargs.get("max_seq_len"), 4096)
 
         # Set the rope scale
         self.config.scale_pos_emb = unwrap(
@@ -247,6 +246,7 @@ class ExllamaV2Container(BaseModelContainer):
         )
 
         # Sets rope alpha value.
+        # Utilize the model's max_position_embeddings as a base value
         # Automatically calculate if unset or defined as an "auto" literal.
         rope_alpha = unwrap(kwargs.get("rope_alpha"), "auto")
         if rope_alpha == "auto":
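
For context on the rope comments above: automatic rope alpha scales off the model's native context (max_position_embeddings), not the possibly-overridden 4096 target, which is why base_seq_len now comes from the HF config. The sketch below uses a commonly cited NTK-alpha quadratic fit as an assumption; the exact formula inside calculate_rope_alpha may differ.

def estimate_rope_alpha(target_seq_len: int, base_seq_len: int) -> float:
    """Rough NTK-aware alpha estimate from the context extension ratio.

    The quadratic coefficients are a community fit assumed for illustration;
    they are not guaranteed to match calculate_rope_alpha exactly.
    """
    ratio = target_seq_len / base_seq_len
    if ratio <= 1.0:
        return 1.0
    return -0.13436 + 0.80541 * ratio + 0.28833 * ratio**2


# A 4096-native model pushed to 8192 tokens -> alpha ≈ 2.63
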
@@ -371,7 +371,7 @@ class ExllamaV2Container(BaseModelContainer):
             )
 
             # Set draft rope alpha. Follows same behavior as model rope alpha.
-            # Use the base sequence length of the model
+            # Use the max_position_embeddings of the model
             draft_rope_alpha = unwrap(draft_args.get("draft_rope_alpha"), "auto")
             if draft_rope_alpha == "auto":
                 self.draft_config.scale_alpha_value = calculate_rope_alpha(
@@ -399,6 +399,8 @@ class ExllamaV2Container(BaseModelContainer):
         self.draft_config.max_input_len = chunk_size
         self.draft_config.max_attention_size = chunk_size**2
 
+
+        print(self.config.max_seq_len)
 
         # Return the created instance
         return self
@@ -175,10 +175,10 @@ class ModelConfig(BaseConfigModel):
     max_seq_len: Optional[int] = Field(
         None,
         description=(
-            "Max sequence length (default: Empty).\n"
-            "Fetched from the model's base sequence length in config.json by default."
+            "Max sequence length (default: 4096).\n"
+            "Set to -1 to fetch from the model's config.json"
         ),
-        ge=0,
+        ge=-1,
     )
     tensor_parallel: Optional[bool] = Field(
         False,
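
The ge bound widens from 0 to -1 so the new sentinel value validates. A minimal standalone check with a toy Pydantic model (not TabbyAPI's actual ModelConfig):

from typing import Optional

from pydantic import BaseModel, Field, ValidationError


class ToyModelConfig(BaseModel):
    # Same constraint shape as the Field above, in isolation
    max_seq_len: Optional[int] = Field(None, ge=-1)


print(ToyModelConfig().max_seq_len)                   # None -> loader falls back to 4096
print(ToyModelConfig(max_seq_len=-1).max_seq_len)     # -1   -> loader reads config.json
print(ToyModelConfig(max_seq_len=16384).max_seq_len)  # explicit value used as-is

try:
    ToyModelConfig(max_seq_len=-2)
except ValidationError:
    print("rejected: values below -1 fail the ge bound")
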
@@ -95,7 +95,7 @@ async def apply_inline_overrides(model_dir: pathlib.Path, **kwargs):
     else:
         logger.warning(
             "Cannot find inline model overrides. "
-            "Make sure they are nested under a \"model:\" key"
+            'Make sure they are nested under a "model:" key'
         )
 
     # Merge draft overrides beforehand
@@ -148,6 +148,13 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
     # Fetch the extra HF configuration options
     hf_model = await HFModel.from_directory(model_path)
 
+    # Override the max sequence length based on user
+    max_seq_len = kwargs.get("max_seq_len")
+    if max_seq_len == -1:
+        kwargs["max_seq_len"] = hf_model.hf_config.max_position_embeddings
+    elif max_seq_len is None:
+        kwargs["max_seq_len"] = 4096
+
     # Create a new container and check if the right dependencies are installed
     backend = unwrap(kwargs.get("backend"), detect_backend(hf_model))
     container_class = _BACKEND_REGISTRY.get(backend)
@@ -39,12 +39,11 @@ class GenerationConfig(BaseModel):
 
 class HuggingFaceConfig(BaseModel):
     """
-    DEPRECATED: Currently a stub and doesn't do anything.
-
     An abridged version of HuggingFace's model config.
     Will be expanded as needed.
     """
 
+    max_position_embeddings: int = 4096
     eos_token_id: Optional[Union[int, List[int]]] = None
     quantization_config: Optional[Dict] = None
 
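
The new max_position_embeddings field gives the loader a value to read when max_seq_len is -1, falling back to 4096 when config.json omits the key. A rough standalone re-creation of that parsing (field names taken from the hunk above, class name hypothetical):

from typing import Dict, List, Optional, Union

from pydantic import BaseModel


class HFConfigSketch(BaseModel):
    # Field names mirror the hunk above; the default kicks in when
    # config.json has no max_position_embeddings entry
    max_position_embeddings: int = 4096
    eos_token_id: Optional[Union[int, List[int]]] = None
    quantization_config: Optional[Dict] = None


print(HFConfigSketch(max_position_embeddings=131072).max_position_embeddings)  # 131072
print(HFConfigSketch(eos_token_id=2).max_position_embeddings)                  # 4096 fallback
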
@@ -78,8 +78,8 @@ model:
   # Options: exllamav2, exllamav3
   backend:
 
-  # Max sequence length (default: Empty).
-  # Fetched from the model's base sequence length in config.json by default.
+  # Max sequence length (default: 4096).
+  # Set to -1 to fetch from the model's config.json
   max_seq_len:
 
   # Load model with tensor parallelism.