From d759a155597418eb6ebd8bf21af2bae4bf588efc Mon Sep 17 00:00:00 2001
From: kingbri
Date: Sun, 7 Apr 2024 18:10:50 -0400
Subject: [PATCH] Model: Fix chunk size handling

Use the correct class attribute name for max_attention_size and fix the
declaration of the draft model's chunk_size. Also expose the parameter
to the end user in both the config and the model load request.

Signed-off-by: kingbri
---
 backends/exllamav2/model.py  | 15 ++++++++-------
 config_sample.yml            |  4 ++++
 endpoints/OAI/types/model.py |  2 ++
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 5e10845..c8cc6e5 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -209,11 +209,11 @@ class ExllamaV2Container:
         if num_experts_override:
             self.config.num_experts_per_token = kwargs.get("num_experts_per_token")
 
-        chunk_size = min(
-            unwrap(kwargs.get("chunk_size"), 2048), self.config.max_seq_len
-        )
+        # Make sure chunk size is >= 16 and <= max seq length
+        user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
+        chunk_size = sorted((16, user_chunk_size, self.config.max_seq_len))[1]
         self.config.max_input_len = chunk_size
-        self.config.max_attn_size = chunk_size**2
+        self.config.max_attention_size = chunk_size**2
 
         draft_args = unwrap(kwargs.get("draft"), {})
         draft_model_name = draft_args.get("draft_model_name")
@@ -248,9 +248,9 @@ class ExllamaV2Container:
             )
 
             self.draft_config.max_seq_len = self.config.max_seq_len
-            if "chunk_size" in kwargs:
-                self.draft_config.max_input_len = kwargs["chunk_size"]
-                self.draft_config.max_attn_size = kwargs["chunk_size"] ** 2
+            if chunk_size:
+                self.draft_config.max_input_len = chunk_size
+                self.draft_config.max_attention_size = chunk_size**2
 
     def find_prompt_template(self, prompt_template_name, model_directory):
         """Tries to find a prompt template using various methods"""
@@ -320,6 +320,7 @@ class ExllamaV2Container:
             "rope_alpha": self.config.scale_alpha_value,
             "max_seq_len": self.config.max_seq_len,
             "cache_mode": self.cache_mode,
+            "chunk_size": self.config.max_input_len,
             "num_experts_per_token": self.config.num_experts_per_token,
             "use_cfg": self.use_cfg,
             "prompt_template": self.prompt_template.name
diff --git a/config_sample.yml b/config_sample.yml
index 11c6555..9cc8337 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -107,6 +107,10 @@ model:
   # Possible values FP16, FP8, Q4. (default: FP16)
   #cache_mode: FP16
 
+  # Chunk size for prompt ingestion. A lower value reduces VRAM usage at the cost of ingestion speed (default: 2048)
+  # NOTE: Effects vary depending on the model. An ideal value is between 512 and 4096
+  #chunk_size: 2048
+
   # Set the prompt template for this model. If empty, attempts to look for the model's chat template. (default: None)
   # If a model contains multiple templates in its tokenizer_config.json, set prompt_template to the name
   # of the template you want to use.
diff --git a/endpoints/OAI/types/model.py b/endpoints/OAI/types/model.py
index a472982..22895be 100644
--- a/endpoints/OAI/types/model.py
+++ b/endpoints/OAI/types/model.py
@@ -16,6 +16,7 @@ class ModelCardParameters(BaseModel):
     rope_scale: Optional[float] = 1.0
     rope_alpha: Optional[float] = 1.0
     cache_mode: Optional[str] = "FP16"
+    chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
     num_experts_per_token: Optional[int] = None
     use_cfg: Optional[bool] = None
@@ -90,6 +91,7 @@ class ModelLoadRequest(BaseModel):
     no_flash_attention: Optional[bool] = False
     # low_mem: Optional[bool] = False
     cache_mode: Optional[str] = "FP16"
+    chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
     num_experts_per_token: Optional[int] = None
     use_cfg: Optional[bool] = None
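
Note on the clamp in backends/exllamav2/model.py: taking the median of
(16, user_chunk_size, max_seq_len) via sorted(...)[1] is equivalent to
clamping the user-supplied value into the [16, max_seq_len] range, as long
as max_seq_len >= 16. The previous code only applied min() against
max_seq_len, so a too-small chunk_size would have been passed straight
through to max_input_len. A minimal standalone sketch of the new behaviour
(the clamp_chunk_size helper and the example values are illustrative only,
not part of the patch):

    # Illustrative sketch; the helper name and example values are hypothetical.
    def clamp_chunk_size(user_chunk_size: int, max_seq_len: int) -> int:
        # Median of three values == clamp user_chunk_size into [16, max_seq_len]
        return sorted((16, user_chunk_size, max_seq_len))[1]

    assert clamp_chunk_size(2048, 4096) == 2048  # in range: unchanged
    assert clamp_chunk_size(8192, 4096) == 4096  # too large: capped at max_seq_len
    assert clamp_chunk_size(1, 4096) == 16       # too small: raised to the 16 floor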
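
On the API side, chunk_size now appears both in the model card parameters
reported back to the client and in the load request, defaulting to 2048 in
each. A quick hedged usage sketch, assuming (as the fields visible in this
hunk suggest) that ModelCardParameters has defaults for all of its fields:

    # Hypothetical usage based only on the fields visible in this diff.
    from endpoints.OAI.types.model import ModelCardParameters

    params = ModelCardParameters(chunk_size=1024)  # per-load override
    print(params.chunk_size)                       # 1024
    print(ModelCardParameters().chunk_size)        # default of 2048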