From 68a660bdb3689460d87430a9f277fde835e5c685 Mon Sep 17 00:00:00 2001 From: DocShotgun <126566557+DocShotgun@users.noreply.github.com> Date: Thu, 1 May 2025 22:55:51 -0700 Subject: [PATCH 1/6] Model: Initial Exl3 cache quantization support --- backends/exllamav2/model.py | 1 + backends/exllamav3/model.py | 32 ++++++++++++++++++++++++++++++-- common/config_models.py | 9 +++++++-- config_sample.yml | 6 ++++-- 4 files changed, 42 insertions(+), 6 deletions(-) diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 27fe33f..5d79e4f 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -185,6 +185,7 @@ class ExllamaV2Container(BaseModelContainer): # MARK: User configuration # Get cache mode + # TODO: Separate validation for Exl2 and Exl3 q-cache options self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16") # Turn off GPU split if the user is using 1 GPU diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index fc10a3d..18e04e8 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -1,6 +1,7 @@ import asyncio import gc import pathlib +import re import traceback from typing import ( Any, @@ -19,6 +20,7 @@ from exllamav3 import ( Model, Tokenizer, ) +from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant from loguru import logger from backends.base_model_container import BaseModelContainer @@ -73,6 +75,7 @@ class ExllamaV3Container(BaseModelContainer): use_tp: bool = False max_seq_len: int = 4096 cache_size: int = 4096 + cache_mode: str = "FP16" chunk_size: int = 2048 max_batch_size: Optional[int] = None @@ -219,7 +222,32 @@ class ExllamaV3Container(BaseModelContainer): # Cache user_cache_size = unwrap(kwargs.get("cache_size"), self.max_seq_len) self.cache_size = self.adjust_cache_size(user_cache_size) - self.cache = Cache(self.model, max_num_tokens=self.cache_size) + self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16") + + # Alias Exl2 q-cache settings + match self.cache_mode: + case "Q4": + self.cache_mode = "4,4" + case "Q6": + self.cache_mode = "6,6" + case "Q8": + self.cache_mode = "8,8" + + split_cache_mode = re.search(r"^([2-8]),([2-8])$", self.cache_mode) + if split_cache_mode: + k_bits = int(split_cache_mode.group(1)) + v_bits = int(split_cache_mode.group(2)) + self.cache = Cache( + self.model, + max_num_tokens=self.cache_size, + layer_type=CacheLayer_quant, + k_bits=k_bits, + v_bits=v_bits, + ) + else: + self.cache = Cache( + self.model, max_num_tokens=self.cache_size, layer_type=CacheLayer_fp16 + ) # Draft cache if self.use_draft_model: @@ -314,7 +342,7 @@ class ExllamaV3Container(BaseModelContainer): max_seq_len=self.max_seq_len, cache_size=self.cache_size, max_batch_size=self.max_batch_size, - # cache_mode=self.cache_mode, + cache_mode=self.cache_mode, chunk_size=self.chunk_size, use_vision=self.use_vision, ) diff --git a/common/config_models.py b/common/config_models.py index 2965fe0..8ee4ff7 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -1,6 +1,7 @@ from pydantic import ( BaseModel, ConfigDict, + constr, Field, PrivateAttr, field_validator, @@ -9,6 +10,7 @@ from typing import List, Literal, Optional, Union CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"] +CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8],[2-8]$")] class Metadata(BaseModel): @@ -227,11 +229,14 @@ class ModelConfig(BaseConfigModel): "or auto-calculate." 
), ) - cache_mode: Optional[CACHE_SIZES] = Field( + # TODO: Separate validation for Exl2 and Exl3 q-cache options + cache_mode: Optional[CACHE_TYPE] = Field( "FP16", description=( "Enable different cache modes for VRAM savings (default: FP16).\n" - f"Possible values: {str(CACHE_SIZES)[15:-1]}." + f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n" + "For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits " + "are integers from 2-8 (i.e. 8,8)." ), ) cache_size: Optional[int] = Field( diff --git a/config_sample.yml b/config_sample.yml index a743c2c..045db51 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -114,7 +114,8 @@ model: rope_alpha: # Enable different cache modes for VRAM savings (default: FP16). - # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'. + # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'. + # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8). cache_mode: FP16 # Size of the prompt cache to allocate (default: max_seq_len). @@ -164,7 +165,8 @@ draft_model: draft_rope_alpha: # Cache mode for draft models to save VRAM (default: FP16). - # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'. + # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'. + # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8). draft_cache_mode: FP16 # An integer array of GBs of VRAM to split between GPUs (default: []). From 58e34ba4c5ca38cfba4bbced01e85c69c5c7c07f Mon Sep 17 00:00:00 2001 From: DocShotgun <126566557+DocShotgun@users.noreply.github.com> Date: Thu, 1 May 2025 23:05:41 -0700 Subject: [PATCH 2/6] Model: Exl3 cache quant settings lenient with whitespace --- backends/exllamav3/model.py | 2 +- common/config_models.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index 18e04e8..330c4e1 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -233,7 +233,7 @@ class ExllamaV3Container(BaseModelContainer): case "Q8": self.cache_mode = "8,8" - split_cache_mode = re.search(r"^([2-8]),([2-8])$", self.cache_mode) + split_cache_mode = re.search(r"^([2-8])\s*,\s*([2-8])$", self.cache_mode) if split_cache_mode: k_bits = int(split_cache_mode.group(1)) v_bits = int(split_cache_mode.group(2)) diff --git a/common/config_models.py b/common/config_models.py index 8ee4ff7..b4a245e 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -10,7 +10,7 @@ from typing import List, Literal, Optional, Union CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"] -CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8],[2-8]$")] +CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8]\s*,\s*[2-8]$")] class Metadata(BaseModel): From a635a719d73d0eb52e29890870a1029568353a78 Mon Sep 17 00:00:00 2001 From: DocShotgun <126566557+DocShotgun@users.noreply.github.com> Date: Sat, 3 May 2025 20:59:36 -0700 Subject: [PATCH 3/6] Model: Enable draft model q-cache in Exl3 * Remove unneeded default fp16 cache layer import --- backends/exllamav3/model.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index 330c4e1..1026faf 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -20,7 +20,7 @@ from exllamav3 import ( Model, Tokenizer, ) -from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant +from exllamav3.cache import CacheLayer_quant from loguru import 
logger from backends.base_model_container import BaseModelContainer @@ -76,6 +76,7 @@ class ExllamaV3Container(BaseModelContainer): max_seq_len: int = 4096 cache_size: int = 4096 cache_mode: str = "FP16" + draft_cache_mode: str = "FP16" chunk_size: int = 2048 max_batch_size: Optional[int] = None @@ -245,13 +246,35 @@ class ExllamaV3Container(BaseModelContainer): v_bits=v_bits, ) else: - self.cache = Cache( - self.model, max_num_tokens=self.cache_size, layer_type=CacheLayer_fp16 - ) + self.cache = Cache(self.model, max_num_tokens=self.cache_size) # Draft cache if self.use_draft_model: - self.draft_cache = Cache(self.draft_model, max_num_tokens = self.cache_size) + # Set draft cache mode + self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16") + + # Alias Exl2 q-cache settings + match self.draft_cache_mode: + case "Q4": + self.draft_cache_mode = "4,4" + case "Q6": + self.draft_cache_mode = "6,6" + case "Q8": + self.draft_cache_mode = "8,8" + + split_draft_cache_mode = re.search(r"^([2-8])\s*,\s*([2-8])$", self.draft_cache_mode) + if split_draft_cache_mode: + draft_k_bits = int(split_draft_cache_mode.group(1)) + draft_v_bits = int(split_draft_cache_mode.group(2)) + self.draft_cache = Cache( + self.draft_model, + max_num_tokens=self.cache_size, + layer_type=CacheLayer_quant, + k_bits=draft_k_bits, + v_bits=draft_v_bits, + ) + else: + self.draft_cache = Cache(self.draft_model, max_num_tokens = self.cache_size) # Max batch size self.max_batch_size = unwrap(kwargs.get("max_batch_size"), 256) From 45b966363ec78d5a667a9b38cb571e4c2da6da37 Mon Sep 17 00:00:00 2001 From: DocShotgun <126566557+DocShotgun@users.noreply.github.com> Date: Sat, 3 May 2025 21:01:03 -0700 Subject: [PATCH 4/6] Tree: Format --- backends/exllamav3/model.py | 12 +++++++----- endpoints/OAI/types/chat_completion.py | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index 1026faf..61986c1 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -165,9 +165,7 @@ class ExllamaV3Container(BaseModelContainer): self.draft_model_dir = draft_model_path self.draft_config = Config.from_directory(str(draft_model_path.resolve())) self.draft_model = Model.from_config(self.draft_config) - logger.info( - f'Using draft model: {str(draft_model_path.resolve())}' - ) + logger.info(f"Using draft model: {str(draft_model_path.resolve())}") else: self.draft_model = None self.craft_cache = None @@ -262,7 +260,9 @@ class ExllamaV3Container(BaseModelContainer): case "Q8": self.draft_cache_mode = "8,8" - split_draft_cache_mode = re.search(r"^([2-8])\s*,\s*([2-8])$", self.draft_cache_mode) + split_draft_cache_mode = re.search( + r"^([2-8])\s*,\s*([2-8])$", self.draft_cache_mode + ) if split_draft_cache_mode: draft_k_bits = int(split_draft_cache_mode.group(1)) draft_v_bits = int(split_draft_cache_mode.group(2)) @@ -274,7 +274,9 @@ class ExllamaV3Container(BaseModelContainer): v_bits=draft_v_bits, ) else: - self.draft_cache = Cache(self.draft_model, max_num_tokens = self.cache_size) + self.draft_cache = Cache( + self.draft_model, max_num_tokens=self.cache_size + ) # Max batch size self.max_batch_size = unwrap(kwargs.get("max_batch_size"), 256) diff --git a/endpoints/OAI/types/chat_completion.py b/endpoints/OAI/types/chat_completion.py index 51695c2..fb73eb9 100644 --- a/endpoints/OAI/types/chat_completion.py +++ b/endpoints/OAI/types/chat_completion.py @@ -84,7 +84,7 @@ class ChatCompletionRequest(CommonCompletionRequest): # Chat completions 
requests do not have a BOS token preference. Backend # respects the tokenization config for the individual model. - add_bos_token: Optional[bool] = Field(default = None) + add_bos_token: Optional[bool] = Field(default=None) @field_validator("add_bos_token", mode="after") def force_bos_token(cls, v): From 9dcde59c57e2afa7d7c19afb60f51ec25f3ccf36 Mon Sep 17 00:00:00 2001 From: DocShotgun <126566557+DocShotgun@users.noreply.github.com> Date: Tue, 6 May 2025 01:18:15 -0700 Subject: [PATCH 5/6] Model: Check for unsupported cache mode in exllamav2 --- backends/exllamav2/model.py | 19 ++++++++++++++++++- common/config_models.py | 1 - 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 5d79e4f..22200ec 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -185,9 +185,17 @@ class ExllamaV2Container(BaseModelContainer): # MARK: User configuration # Get cache mode - # TODO: Separate validation for Exl2 and Exl3 q-cache options self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16") + # Catch exllamav3 cache_mode + if not self.cache_mode.startswith("Q"): + logger.warning( + f"Provided cache mode '{self.cache_mode}' is not a " + "valid choice for exllamav2, please check your settings. " + "Defaulting to FP16." + ) + self.cache_mode = "FP16" + # Turn off GPU split if the user is using 1 GPU gpu_count = torch.cuda.device_count() gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True) @@ -393,6 +401,15 @@ class ExllamaV2Container(BaseModelContainer): # Set draft cache mode self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16") + # Catch exllamav3 draft_cache_mode + if not self.draft_cache_mode.startswith("Q"): + logger.warning( + f"Provided draft cache mode '{self.draft_cache_mode}' is not a " + "valid choice for exllamav2, please check your settings. " + "Defaulting to FP16." + ) + self.draft_cache_mode = "FP16" + # Edit the draft config size if chunk_size: self.draft_config.max_input_len = chunk_size diff --git a/common/config_models.py b/common/config_models.py index b4a245e..e228552 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -229,7 +229,6 @@ class ModelConfig(BaseConfigModel): "or auto-calculate." ), ) - # TODO: Separate validation for Exl2 and Exl3 q-cache options cache_mode: Optional[CACHE_TYPE] = Field( "FP16", description=( From 638eef401acf02e1460e1bb2c01b277089b22378 Mon Sep 17 00:00:00 2001 From: kingbri <8082010+kingbri1@users.noreply.github.com> Date: Thu, 8 May 2025 23:10:03 -0400 Subject: [PATCH 6/6] Model: Move cache creation to a common function Prevents repetitiveness while also creating a Cache class. 
Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com> --- backends/exllamav3/model.py | 81 ++++++++++++++----------------------- 1 file changed, 31 insertions(+), 50 deletions(-) diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index 61986c1..c386a6e 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -168,7 +168,7 @@ class ExllamaV3Container(BaseModelContainer): logger.info(f"Using draft model: {str(draft_model_path.resolve())}") else: self.draft_model = None - self.craft_cache = None + self.draft_cache = None # Turn off GPU split if the user is using 1 GPU gpu_count = torch.cuda.device_count() @@ -222,61 +222,15 @@ class ExllamaV3Container(BaseModelContainer): user_cache_size = unwrap(kwargs.get("cache_size"), self.max_seq_len) self.cache_size = self.adjust_cache_size(user_cache_size) self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16") - - # Alias Exl2 q-cache settings - match self.cache_mode: - case "Q4": - self.cache_mode = "4,4" - case "Q6": - self.cache_mode = "6,6" - case "Q8": - self.cache_mode = "8,8" - - split_cache_mode = re.search(r"^([2-8])\s*,\s*([2-8])$", self.cache_mode) - if split_cache_mode: - k_bits = int(split_cache_mode.group(1)) - v_bits = int(split_cache_mode.group(2)) - self.cache = Cache( - self.model, - max_num_tokens=self.cache_size, - layer_type=CacheLayer_quant, - k_bits=k_bits, - v_bits=v_bits, - ) - else: - self.cache = Cache(self.model, max_num_tokens=self.cache_size) + self.cache = self.create_cache(self.cache_mode, self.model) # Draft cache if self.use_draft_model: # Set draft cache mode self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16") - - # Alias Exl2 q-cache settings - match self.draft_cache_mode: - case "Q4": - self.draft_cache_mode = "4,4" - case "Q6": - self.draft_cache_mode = "6,6" - case "Q8": - self.draft_cache_mode = "8,8" - - split_draft_cache_mode = re.search( - r"^([2-8])\s*,\s*([2-8])$", self.draft_cache_mode + self.draft_cache = self.create_cache( + self.draft_cache_mode, self.draft_model ) - if split_draft_cache_mode: - draft_k_bits = int(split_draft_cache_mode.group(1)) - draft_v_bits = int(split_draft_cache_mode.group(2)) - self.draft_cache = Cache( - self.draft_model, - max_num_tokens=self.cache_size, - layer_type=CacheLayer_quant, - k_bits=draft_k_bits, - v_bits=draft_v_bits, - ) - else: - self.draft_cache = Cache( - self.draft_model, max_num_tokens=self.cache_size - ) # Max batch size self.max_batch_size = unwrap(kwargs.get("max_batch_size"), 256) @@ -355,6 +309,33 @@ class ExllamaV3Container(BaseModelContainer): return chunk_size + def create_cache(self, raw_cache_mode: str, model: Model): + # Cast exl2 types to exl3 + match raw_cache_mode: + case "Q4": + raw_cache_mode = "4,4" + case "Q6": + raw_cache_mode = "6,6" + case "Q8": + raw_cache_mode = "8,8" + + split_cache_mode = re.search(r"^([2-8])\s*,\s*([2-8])$", raw_cache_mode) + + if split_cache_mode: + draft_k_bits = int(split_cache_mode.group(1)) + draft_v_bits = int(split_cache_mode.group(2)) + cache = Cache( + model, + max_num_tokens=self.cache_size, + layer_type=CacheLayer_quant, + k_bits=draft_k_bits, + v_bits=draft_v_bits, + ) + else: + cache = Cache(model, max_num_tokens=self.cache_size) + + return cache + def model_info(self) -> ModelCard: """ Returns a dictionary of the current model's configuration parameters.
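
Note on the resulting cache-mode scheme: across the series, the legacy Exl2 names Q4/Q6/Q8 are aliased to the Exl3 "k_bits,v_bits" form, any string matching ^([2-8])\s*,\s*([2-8])$ builds a quantized cache layer, and everything else keeps the default FP16 cache. The standalone helper below is only an illustrative sketch of that parsing step; the function name, alias table, and (k_bits, v_bits) return value are not part of the PR, and it uses nothing beyond the Python standard library.

    import re
    from typing import Optional, Tuple

    # Legacy Exl2 aliases mapped to their Exl3 "k_bits,v_bits" equivalents
    EXL2_ALIASES = {"Q4": "4,4", "Q6": "6,6", "Q8": "8,8"}

    def parse_cache_mode(raw_cache_mode: str) -> Optional[Tuple[int, int]]:
        """Return (k_bits, v_bits) for a quantized cache, or None for FP16."""
        mode = EXL2_ALIASES.get(raw_cache_mode, raw_cache_mode)

        # Accept "k,v" with optional whitespace around the comma, bits 2-8
        match = re.search(r"^([2-8])\s*,\s*([2-8])$", mode)
        if match:
            return int(match.group(1)), int(match.group(2))

        # Anything else (e.g. "FP16") keeps the default FP16 cache layer
        return None

    # parse_cache_mode("Q6")   -> (6, 6)
    # parse_cache_mode("8, 4") -> (8, 4)
    # parse_cache_mode("FP16") -> None

In the final patch this logic lives in ExllamaV3Container.create_cache(), which builds a Cache with CacheLayer_quant and the parsed bit widths, or with the default FP16 layer, always passing max_num_tokens=self.cache_size. At the config layer, CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8]\s*,\s*[2-8]$")] enforces the same shape, so a value such as "6, 8" passes validation while "9,9" is rejected before it reaches either backend.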