From 767e6a798a334eb24cd5b33014afe6327a527e1d Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Sun, 26 May 2024 14:17:01 -0700
Subject: [PATCH] API + Model: Add support for specifying k/v cache size

---
 backends/exllamav2/model.py  | 37 +++++++++++++++++++++++++++++++-----
 common/args.py               |  5 +++++
 config_sample.yml            |  6 ++++++
 endpoints/OAI/types/model.py |  8 ++++++++
 4 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index e1a85d3..7ffc9ce 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -58,6 +58,7 @@ class ExllamaV2Container:
     paged: bool = True
 
     # Internal config vars
+    cache_size: int = None
     cache_mode: str = "FP16"
     max_batch_size: int = 20
     generation_config: Optional[GenerationConfig] = None
@@ -91,9 +92,11 @@ class ExllamaV2Container:
                          loading_draft: bool)
         **kwargs:
             `cache_mode` (str): Sets cache mode, "FP16" or "FP8"
-                (defaulf: "FP16")
+                (default: "FP16")
             'max_seq_len' (int): Override model's default max sequence
                 length (default: 4096)
+            'cache_size' (int): Num of tokens to allocate space for in the k/v cache
+                (default: max_seq_len)
             'rope_scale' (float): Set RoPE scaling factor for model
                 (default: 1.0)
             'rope_alpha' (float): Set RoPE alpha (NTK) factor for model
@@ -189,6 +192,16 @@ class ExllamaV2Container:
             kwargs.get("rope_alpha"), self.calculate_rope_alpha(base_seq_len)
         )
 
+        # Set k/v cache size
+        self.cache_size = unwrap(kwargs.get("cache_size"), self.config.max_seq_len)
+        if self.cache_size < self.config.max_seq_len:
+            logger.warning(
+                "Your specified cache_size is smaller than your "
+                "desired context length. \n"
+                "Defaulting cache_size to max_seq_len."
+            )
+            self.cache_size = self.config.max_seq_len
+
         # Enable fasttensors loading if present
         self.config.fasttensors = unwrap(kwargs.get("fasttensors"), False)
 
@@ -359,6 +372,7 @@ class ExllamaV2Container:
             "rope_scale": self.config.scale_pos_emb,
             "rope_alpha": self.config.scale_alpha_value,
             "max_seq_len": self.config.max_seq_len,
+            "cache_size": self.cache_size,
             "cache_mode": self.cache_mode,
             "chunk_size": self.config.max_input_len,
             "num_experts_per_token": self.config.num_experts_per_token,
@@ -484,7 +498,11 @@ class ExllamaV2Container:
             if not self.quiet:
                 logger.info("Loading draft model: " + self.draft_config.model_dir)
 
-            self.draft_cache = ExLlamaV2Cache(self.draft_model, lazy=True)
+            self.draft_cache = ExLlamaV2Cache(
+                self.draft_model,
+                max_seq_len=self.cache_size,
+                lazy=True,
+            )
             for value in self.draft_model.load_autosplit_gen(
                 self.draft_cache,
                 reserve_vram=autosplit_reserve,
@@ -516,15 +534,24 @@
 
         if self.cache_mode == "Q4":
             self.cache = ExLlamaV2Cache_Q4(
-                self.model, lazy=self.gpu_split_auto, batch_size=1
+                self.model,
+                max_seq_len=self.cache_size,
+                lazy=self.gpu_split_auto,
+                batch_size=1,
             )
         elif self.cache_mode == "FP8":
             self.cache = ExLlamaV2Cache_8bit(
-                self.model, lazy=self.gpu_split_auto, batch_size=1
+                self.model,
+                max_seq_len=self.cache_size,
+                lazy=self.gpu_split_auto,
+                batch_size=1,
             )
         else:
             self.cache = ExLlamaV2Cache(
-                self.model, lazy=self.gpu_split_auto, batch_size=1
+                self.model,
+                max_seq_len=self.cache_size,
+                lazy=self.gpu_split_auto,
+                batch_size=1,
             )
 
         # Load model with autosplit
diff --git a/common/args.py b/common/args.py
index a1e1282..09c0620 100644
--- a/common/args.py
+++ b/common/args.py
@@ -82,6 +82,11 @@ def add_model_args(parser: argparse.ArgumentParser):
         type=str_to_bool,
         help="Overrides base model context length",
     )
+    model_group.add_argument(
+        "--cache-size",
+        type=int,
+        help="The size of the prompt cache (in number of tokens) to allocate",
+    )
     model_group.add_argument(
         "--rope-scale", type=float, help="Sets rope_scale or compress_pos_emb"
     )
diff --git a/config_sample.yml b/config_sample.yml
index ead55b5..ccc6816 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -77,6 +77,12 @@ model:
   # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral 7B)
   #override_base_seq_len:
 
+  # Size of the prompt cache to allocate (in number of tokens, must be a multiple of 256)
+  # Larger cache uses more VRAM, but allows for more prompts to be cached and a larger batch of gens to proceed simultaneously
+  # The minimum size is max_seq_len, but we recommend setting this to the highest value that will fit on your GPU
+  # Recommend setting this to at least max_seq_len * 2 if you want to use CFG with full-length positive and negative prompts
+  #cache_size:
+
   # Automatically allocate resources to GPUs (default: True)
   # NOTE: Not parsed for single GPU users
   #gpu_split_auto: True
diff --git a/endpoints/OAI/types/model.py b/endpoints/OAI/types/model.py
index 41dc68e..9fdbf92 100644
--- a/endpoints/OAI/types/model.py
+++ b/endpoints/OAI/types/model.py
@@ -15,6 +15,7 @@ class ModelCardParameters(BaseModel):
     max_seq_len: Optional[int] = None
     rope_scale: Optional[float] = 1.0
     rope_alpha: Optional[float] = 1.0
+    cache_size: Optional[int] = None
     cache_mode: Optional[str] = "FP16"
     chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
@@ -72,6 +73,13 @@ class ModelLoadRequest(BaseModel):
         default=None,
         examples=[4096],
     )
+    cache_size: Optional[int] = Field(
+        description=(
+            "Number in tokens, must be greater than or equal to max_seq_len"
+        ),
+        default=None,
+        examples=[4096],
+    )
     gpu_split_auto: Optional[bool] = True
     autosplit_reserve: Optional[List[float]] = [96]
     gpu_split: Optional[List[float]] = Field(
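
Note (reviewer sketch, not part of the patch): the cache_size handling added to
ExllamaV2Container boils down to "default to max_seq_len, and never allow a
smaller value". The standalone function below restates that fallback so it can
be eyeballed outside the container; resolve_cache_size and the example numbers
are illustrative only and do not exist in the codebase.

    import logging
    from typing import Optional

    logger = logging.getLogger(__name__)


    def resolve_cache_size(requested: Optional[int], max_seq_len: int) -> int:
        """Mirror of the cache_size fallback in backends/exllamav2/model.py."""
        # Equivalent to unwrap(kwargs.get("cache_size"), self.config.max_seq_len)
        cache_size = requested if requested is not None else max_seq_len

        # A cache smaller than the context length is clamped with a warning
        if cache_size < max_seq_len:
            logger.warning(
                "Your specified cache_size is smaller than your "
                "desired context length.\n"
                "Defaulting cache_size to max_seq_len."
            )
            cache_size = max_seq_len
        return cache_size


    # Per the config_sample.yml note, max_seq_len * 2 leaves room for CFG with
    # full-length positive and negative prompts.
    assert resolve_cache_size(8192, 4096) == 8192
    assert resolve_cache_size(None, 4096) == 4096
    assert resolve_cache_size(2048, 4096) == 4096  # warns, then clamps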