From 767e6a798a334eb24cd5b33014afe6327a527e1d Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Sun, 26 May 2024 14:17:01 -0700
Subject: [PATCH] API + Model: Add support for specifying k/v cache size

---
 backends/exllamav2/model.py  | 37 +++++++++++++++++++++++++++++++-----
 common/args.py               |  5 +++++
 config_sample.yml            |  6 ++++++
 endpoints/OAI/types/model.py |  8 ++++++++
 4 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index e1a85d3..7ffc9ce 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -58,6 +58,7 @@ class ExllamaV2Container:
     paged: bool = True
 
     # Internal config vars
+    cache_size: int = None
     cache_mode: str = "FP16"
     max_batch_size: int = 20
     generation_config: Optional[GenerationConfig] = None
@@ -91,9 +92,11 @@ class ExllamaV2Container:
                          loading_draft: bool)
         **kwargs:
             `cache_mode` (str): Sets cache mode, "FP16" or "FP8"
-                (defaulf: "FP16")
+                (default: "FP16")
             'max_seq_len' (int): Override model's default max sequence
                 length (default: 4096)
+            'cache_size' (int): Num of tokens to allocate space for in the k/v cache
+                (default: max_seq_len)
             'rope_scale' (float): Set RoPE scaling factor for model
                 (default: 1.0)
             'rope_alpha' (float): Set RoPE alpha (NTK) factor for model
@@ -189,6 +192,16 @@ class ExllamaV2Container:
             kwargs.get("rope_alpha"), self.calculate_rope_alpha(base_seq_len)
         )
 
+        # Set k/v cache size
+        self.cache_size = unwrap(kwargs.get("cache_size"), self.config.max_seq_len)
+        if self.cache_size < self.config.max_seq_len:
+            logger.warning(
+                "Your specified cache_size is smaller than your "
+                "desired context length. \n"
+                "Defaulting cache_size to max_seq_len."
+            )
+            self.cache_size = self.config.max_seq_len
+
         # Enable fasttensors loading if present
         self.config.fasttensors = unwrap(kwargs.get("fasttensors"), False)
 
@@ -359,6 +372,7 @@ class ExllamaV2Container:
             "rope_scale": self.config.scale_pos_emb,
             "rope_alpha": self.config.scale_alpha_value,
             "max_seq_len": self.config.max_seq_len,
+            "cache_size": self.cache_size,
             "cache_mode": self.cache_mode,
             "chunk_size": self.config.max_input_len,
             "num_experts_per_token": self.config.num_experts_per_token,
@@ -484,7 +498,11 @@ class ExllamaV2Container:
             if not self.quiet:
                 logger.info("Loading draft model: " + self.draft_config.model_dir)
 
-            self.draft_cache = ExLlamaV2Cache(self.draft_model, lazy=True)
+            self.draft_cache = ExLlamaV2Cache(
+                self.draft_model,
+                max_seq_len=self.cache_size,
+                lazy=True,
+            )
             for value in self.draft_model.load_autosplit_gen(
                 self.draft_cache,
                 reserve_vram=autosplit_reserve,
@@ -516,15 +534,24 @@
 
         if self.cache_mode == "Q4":
             self.cache = ExLlamaV2Cache_Q4(
-                self.model, lazy=self.gpu_split_auto, batch_size=1
+                self.model,
+                max_seq_len=self.cache_size,
+                lazy=self.gpu_split_auto,
+                batch_size=1,
             )
         elif self.cache_mode == "FP8":
             self.cache = ExLlamaV2Cache_8bit(
-                self.model, lazy=self.gpu_split_auto, batch_size=1
+                self.model,
+                max_seq_len=self.cache_size,
+                lazy=self.gpu_split_auto,
+                batch_size=1,
             )
         else:
             self.cache = ExLlamaV2Cache(
-                self.model, lazy=self.gpu_split_auto, batch_size=1
+                self.model,
+                max_seq_len=self.cache_size,
+                lazy=self.gpu_split_auto,
+                batch_size=1,
             )
 
         # Load model with autosplit
diff --git a/common/args.py b/common/args.py
index a1e1282..09c0620 100644
--- a/common/args.py
+++ b/common/args.py
@@ -82,6 +82,11 @@ def add_model_args(parser: argparse.ArgumentParser):
         type=str_to_bool,
         help="Overrides base model context length",
     )
+    model_group.add_argument(
+        "--cache-size",
+        type=int,
+        help="The size of the prompt cache (in number of tokens) to allocate",
+    )
     model_group.add_argument(
         "--rope-scale", type=float, help="Sets rope_scale or compress_pos_emb"
     )
diff --git a/config_sample.yml b/config_sample.yml
index ead55b5..ccc6816 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -77,6 +77,12 @@ model:
   # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral 7B)
   #override_base_seq_len:
 
+  # Size of the prompt cache to allocate (in number of tokens, must be a multiple of 256)
+  # Larger cache uses more VRAM, but allows for more prompts to be cached and a larger batch of gens to proceed simultaneously
+  # The minimum size is max_seq_len, but we recommend setting this to the highest value that will fit on your GPU
+  # Recommend setting this to at least max_seq_len * 2 if you want to use CFG with full-length positive and negative prompts
+  #cache_size:
+
   # Automatically allocate resources to GPUs (default: True)
   # NOTE: Not parsed for single GPU users
   #gpu_split_auto: True
diff --git a/endpoints/OAI/types/model.py b/endpoints/OAI/types/model.py
index 41dc68e..9fdbf92 100644
--- a/endpoints/OAI/types/model.py
+++ b/endpoints/OAI/types/model.py
@@ -15,6 +15,7 @@ class ModelCardParameters(BaseModel):
     max_seq_len: Optional[int] = None
     rope_scale: Optional[float] = 1.0
     rope_alpha: Optional[float] = 1.0
+    cache_size: Optional[int] = None
     cache_mode: Optional[str] = "FP16"
     chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
@@ -72,6 +73,13 @@ class ModelLoadRequest(BaseModel):
         default=None,
         examples=[4096],
     )
+    cache_size: Optional[int] = Field(
+        description=(
+            "Number in tokens, must be greater than or equal to max_seq_len"
+        ),
+        default=None,
+        examples=[4096],
+    )
     gpu_split_auto: Optional[bool] = True
     autosplit_reserve: Optional[List[float]] = [96]
     gpu_split: Optional[List[float]] = Field(
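
Note (reviewer sketch, not part of the patch): the cache_size handling added to
ExllamaV2Container boils down to "default to max_seq_len, and never allow a
smaller value". The standalone function below restates that fallback so it can
be eyeballed outside the container; resolve_cache_size and the example numbers
are illustrative only and do not exist in the codebase.

    import logging
    from typing import Optional

    logger = logging.getLogger(__name__)


    def resolve_cache_size(requested: Optional[int], max_seq_len: int) -> int:
        """Mirror of the cache_size fallback in backends/exllamav2/model.py."""
        # Equivalent to unwrap(kwargs.get("cache_size"), self.config.max_seq_len)
        cache_size = requested if requested is not None else max_seq_len

        # A cache smaller than the context length is clamped with a warning
        if cache_size < max_seq_len:
            logger.warning(
                "Your specified cache_size is smaller than your "
                "desired context length.\n"
                "Defaulting cache_size to max_seq_len."
            )
            cache_size = max_seq_len
        return cache_size


    # Per the config_sample.yml note, max_seq_len * 2 leaves room for CFG with
    # full-length positive and negative prompts.
    assert resolve_cache_size(8192, 4096) == 8192
    assert resolve_cache_size(None, 4096) == 4096
    assert resolve_cache_size(2048, 4096) == 4096  # warns, then clamps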