From d759a155597418eb6ebd8bf21af2bae4bf588efc Mon Sep 17 00:00:00 2001
From: kingbri
Date: Sun, 7 Apr 2024 18:10:50 -0400
Subject: [PATCH] Model: Fix chunk size handling

Use the correct class attribute name for max_attention_size and fix the
declaration of the draft model's chunk_size. Also expose the parameter
to the end user in both the config and the model load request.

Signed-off-by: kingbri
---
 backends/exllamav2/model.py  | 15 ++++++++-------
 config_sample.yml            |  4 ++++
 endpoints/OAI/types/model.py |  2 ++
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 5e10845..c8cc6e5 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -209,11 +209,11 @@ class ExllamaV2Container:
         if num_experts_override:
             self.config.num_experts_per_token = kwargs.get("num_experts_per_token")
 
-        chunk_size = min(
-            unwrap(kwargs.get("chunk_size"), 2048), self.config.max_seq_len
-        )
+        # Make sure chunk size is >= 16 and <= max seq length
+        user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
+        chunk_size = sorted((16, user_chunk_size, self.config.max_seq_len))[1]
         self.config.max_input_len = chunk_size
-        self.config.max_attn_size = chunk_size**2
+        self.config.max_attention_size = chunk_size**2
 
         draft_args = unwrap(kwargs.get("draft"), {})
         draft_model_name = draft_args.get("draft_model_name")
@@ -248,9 +248,9 @@ class ExllamaV2Container:
             )
 
             self.draft_config.max_seq_len = self.config.max_seq_len
-            if "chunk_size" in kwargs:
-                self.draft_config.max_input_len = kwargs["chunk_size"]
-                self.draft_config.max_attn_size = kwargs["chunk_size"] ** 2
+            if chunk_size:
+                self.draft_config.max_input_len = chunk_size
+                self.draft_config.max_attention_size = chunk_size**2
 
     def find_prompt_template(self, prompt_template_name, model_directory):
         """Tries to find a prompt template using various methods"""
@@ -320,6 +320,7 @@ class ExllamaV2Container:
             "rope_alpha": self.config.scale_alpha_value,
             "max_seq_len": self.config.max_seq_len,
             "cache_mode": self.cache_mode,
+            "chunk_size": self.config.max_input_len,
             "num_experts_per_token": self.config.num_experts_per_token,
             "use_cfg": self.use_cfg,
             "prompt_template": self.prompt_template.name
diff --git a/config_sample.yml b/config_sample.yml
index 11c6555..9cc8337 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -107,6 +107,10 @@ model:
   # Possible values FP16, FP8, Q4. (default: FP16)
   #cache_mode: FP16
 
+  # Chunk size for prompt ingestion. A lower value reduces VRAM usage at the cost of ingestion speed (default: 2048)
+  # NOTE: Effects vary depending on the model. An ideal value is between 512 and 4096
+  #chunk_size: 2048
+
   # Set the prompt template for this model. If empty, attempts to look for the model's chat template. (default: None)
   # If a model contains multiple templates in its tokenizer_config.json, set prompt_template to the name
   # of the template you want to use.
diff --git a/endpoints/OAI/types/model.py b/endpoints/OAI/types/model.py
index a472982..22895be 100644
--- a/endpoints/OAI/types/model.py
+++ b/endpoints/OAI/types/model.py
@@ -16,6 +16,7 @@ class ModelCardParameters(BaseModel):
     rope_scale: Optional[float] = 1.0
     rope_alpha: Optional[float] = 1.0
     cache_mode: Optional[str] = "FP16"
+    chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
     num_experts_per_token: Optional[int] = None
     use_cfg: Optional[bool] = None
@@ -90,6 +91,7 @@ class ModelLoadRequest(BaseModel):
     no_flash_attention: Optional[bool] = False
     # low_mem: Optional[bool] = False
     cache_mode: Optional[str] = "FP16"
+    chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
     num_experts_per_token: Optional[int] = None
     use_cfg: Optional[bool] = None
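
Note on the clamp in backends/exllamav2/model.py: taking the median of
(16, user_chunk_size, max_seq_len) via sorted(...)[1] is equivalent to
clamping the user-supplied value into the [16, max_seq_len] range, as long
as max_seq_len >= 16. The previous code only applied min() against
max_seq_len, so a too-small chunk_size would have been passed straight
through to max_input_len. A minimal standalone sketch of the new behaviour
(the clamp_chunk_size helper and the example values are illustrative only,
not part of the patch):

    # Illustrative sketch; the helper name and example values are hypothetical.
    def clamp_chunk_size(user_chunk_size: int, max_seq_len: int) -> int:
        # Median of three values == clamp user_chunk_size into [16, max_seq_len]
        return sorted((16, user_chunk_size, max_seq_len))[1]

    assert clamp_chunk_size(2048, 4096) == 2048  # in range: unchanged
    assert clamp_chunk_size(8192, 4096) == 4096  # too large: capped at max_seq_len
    assert clamp_chunk_size(1, 4096) == 16       # too small: raised to the 16 floor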
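
On the API side, chunk_size now appears both in the model card parameters
reported back to the client and in the load request, defaulting to 2048 in
each. A quick hedged usage sketch, assuming (as the fields visible in this
hunk suggest) that ModelCardParameters has defaults for all of its fields:

    # Hypothetical usage based only on the fields visible in this diff.
    from endpoints.OAI.types.model import ModelCardParameters

    params = ModelCardParameters(chunk_size=1024)  # per-load override
    print(params.chunk_size)                       # 1024
    print(ModelCardParameters().chunk_size)        # default of 2048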