diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 29caa34..fd23a28 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -235,11 +235,10 @@ class ExllamaV2Container(BaseModelContainer):
 
         # Grab the base model's sequence length before overrides for
         # rope calculations
-        base_seq_len = self.config.max_seq_len
+        base_seq_len = hf_model.hf_config.max_position_embeddings
 
         # Set the target seq len if present
-        # Fallback to base_seq_len if not provided
-        target_seq_len = unwrap(kwargs.get("max_seq_len"), base_seq_len)
+        target_seq_len = unwrap(kwargs.get("max_seq_len"), 4096)
 
         # Set the rope scale
         self.config.scale_pos_emb = unwrap(
@@ -247,6 +246,7 @@ class ExllamaV2Container(BaseModelContainer):
         )
 
         # Sets rope alpha value.
+        # Utilize the model's max_position_embeddings as a base value
         # Automatically calculate if unset or defined as an "auto" literal.
         rope_alpha = unwrap(kwargs.get("rope_alpha"), "auto")
         if rope_alpha == "auto":
@@ -371,7 +371,7 @@ class ExllamaV2Container(BaseModelContainer):
         )
 
         # Set draft rope alpha. Follows same behavior as model rope alpha.
-        # Use the base sequence length of the model
+        # Use the max_position_embeddings of the model
         draft_rope_alpha = unwrap(draft_args.get("draft_rope_alpha"), "auto")
         if draft_rope_alpha == "auto":
             self.draft_config.scale_alpha_value = calculate_rope_alpha(
@@ -399,6 +399,8 @@ class ExllamaV2Container(BaseModelContainer):
             self.draft_config.max_input_len = chunk_size
             self.draft_config.max_attention_size = chunk_size**2
 
+        print(self.config.max_seq_len)
+
         # Return the created instance
         return self
 
diff --git a/common/config_models.py b/common/config_models.py
index 0958a8e..46a7b5e 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -175,10 +175,10 @@ class ModelConfig(BaseConfigModel):
     max_seq_len: Optional[int] = Field(
         None,
         description=(
-            "Max sequence length (default: Empty).\n"
-            "Fetched from the model's base sequence length in config.json by default."
+            "Max sequence length (default: 4096).\n"
+            "Set to -1 to fetch from the model's config.json"
         ),
-        ge=0,
+        ge=-1,
     )
     tensor_parallel: Optional[bool] = Field(
         False,
diff --git a/common/model.py b/common/model.py
index 1935025..6534951 100644
--- a/common/model.py
+++ b/common/model.py
@@ -95,7 +95,7 @@ async def apply_inline_overrides(model_dir: pathlib.Path, **kwargs):
     else:
         logger.warning(
             "Cannot find inline model overrides. "
-            "Make sure they are nested under a \"model:\" key"
+            'Make sure they are nested under a "model:" key'
         )
 
     # Merge draft overrides beforehand
@@ -148,6 +148,13 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
     # Fetch the extra HF configuration options
     hf_model = await HFModel.from_directory(model_path)
 
+    # Override the max sequence length based on user
+    max_seq_len = kwargs.get("max_seq_len")
+    if max_seq_len == -1:
+        kwargs["max_seq_len"] = hf_model.hf_config.max_position_embeddings
+    elif max_seq_len is None:
+        kwargs["max_seq_len"] = 4096
+
     # Create a new container and check if the right dependencies are installed
     backend = unwrap(kwargs.get("backend"), detect_backend(hf_model))
     container_class = _BACKEND_REGISTRY.get(backend)
diff --git a/common/transformers_utils.py b/common/transformers_utils.py
index 6cac3b8..a7b0f0c 100644
--- a/common/transformers_utils.py
+++ b/common/transformers_utils.py
@@ -39,12 +39,11 @@ class GenerationConfig(BaseModel):
 
 class HuggingFaceConfig(BaseModel):
     """
-    DEPRECATED: Currently a stub and doesn't do anything.
-
     An abridged version of HuggingFace's model config. Will be expanded as
     needed.
     """
 
+    max_position_embeddings: int = 4096
     eos_token_id: Optional[Union[int, List[int]]] = None
     quantization_config: Optional[Dict] = None
 
diff --git a/config_sample.yml b/config_sample.yml
index ffe2605..1ee82a9 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -78,8 +78,8 @@ model:
   # Options: exllamav2, exllamav3
   backend:
 
-  # Max sequence length (default: Empty).
-  # Fetched from the model's base sequence length in config.json by default.
+  # Max sequence length (default: 4096).
+  # Set to -1 to fetch from the model's config.json
   max_seq_len:
 
   # Load model with tensor parallelism.
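Note on the behavior change: with this diff, `max_seq_len` resolves as explicit value > `-1` (fetch `max_position_embeddings` from the model's `config.json`) > unset (fall back to 4096), as added in `common/model.py`. Below is a minimal standalone sketch of that precedence for reference; `resolve_max_seq_len` is a hypothetical helper written for illustration and is not part of this diff.

```python
from typing import Optional


def resolve_max_seq_len(
    user_value: Optional[int], max_position_embeddings: int
) -> int:
    """Resolve the effective max sequence length.

    -1   -> use the model's max_position_embeddings from config.json
    None -> fall back to the 4096 default
    else -> use the user-provided value as-is
    """
    if user_value == -1:
        return max_position_embeddings
    if user_value is None:
        return 4096
    return user_value


# Example: a model whose config.json advertises 32768 positions
assert resolve_max_seq_len(-1, 32768) == 32768
assert resolve_max_seq_len(None, 32768) == 4096
assert resolve_max_seq_len(8192, 32768) == 8192
```

In config terms, leaving `max_seq_len:` empty in `config_sample.yml` now yields 4096, while `max_seq_len: -1` restores the old behavior of reading the length from the model's `config.json`.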