diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 27fe33f..5d79e4f 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -185,6 +185,7 @@ class ExllamaV2Container(BaseModelContainer):
         # MARK: User configuration

         # Get cache mode
+        # TODO: Separate validation for Exl2 and Exl3 q-cache options
         self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")

         # Turn off GPU split if the user is using 1 GPU
diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index fc10a3d..18e04e8 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -1,6 +1,7 @@
 import asyncio
 import gc
 import pathlib
+import re
 import traceback
 from typing import (
     Any,
@@ -19,6 +20,7 @@ from exllamav3 import (
     Model,
     Tokenizer,
 )
+from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant
 from loguru import logger

 from backends.base_model_container import BaseModelContainer
@@ -73,6 +75,7 @@ class ExllamaV3Container(BaseModelContainer):
     use_tp: bool = False
     max_seq_len: int = 4096
     cache_size: int = 4096
+    cache_mode: str = "FP16"
     chunk_size: int = 2048
     max_batch_size: Optional[int] = None

@@ -219,7 +222,32 @@ class ExllamaV3Container(BaseModelContainer):
         # Cache
         user_cache_size = unwrap(kwargs.get("cache_size"), self.max_seq_len)
         self.cache_size = self.adjust_cache_size(user_cache_size)
-        self.cache = Cache(self.model, max_num_tokens=self.cache_size)
+        self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")
+
+        # Alias Exl2 q-cache settings
+        match self.cache_mode:
+            case "Q4":
+                self.cache_mode = "4,4"
+            case "Q6":
+                self.cache_mode = "6,6"
+            case "Q8":
+                self.cache_mode = "8,8"
+
+        split_cache_mode = re.search(r"^([2-8]),([2-8])$", self.cache_mode)
+        if split_cache_mode:
+            k_bits = int(split_cache_mode.group(1))
+            v_bits = int(split_cache_mode.group(2))
+            self.cache = Cache(
+                self.model,
+                max_num_tokens=self.cache_size,
+                layer_type=CacheLayer_quant,
+                k_bits=k_bits,
+                v_bits=v_bits,
+            )
+        else:
+            self.cache = Cache(
+                self.model, max_num_tokens=self.cache_size, layer_type=CacheLayer_fp16
+            )

         # Draft cache
         if self.use_draft_model:
@@ -314,7 +342,7 @@ class ExllamaV3Container(BaseModelContainer):
             max_seq_len=self.max_seq_len,
             cache_size=self.cache_size,
             max_batch_size=self.max_batch_size,
-            # cache_mode=self.cache_mode,
+            cache_mode=self.cache_mode,
             chunk_size=self.chunk_size,
             use_vision=self.use_vision,
         )
diff --git a/common/config_models.py b/common/config_models.py
index 2965fe0..8ee4ff7 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -1,6 +1,7 @@
 from pydantic import (
     BaseModel,
     ConfigDict,
+    constr,
     Field,
     PrivateAttr,
     field_validator,
@@ -9,6 +10,7 @@
 from typing import List, Literal, Optional, Union

 CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"]
+CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8],[2-8]$")]


 class Metadata(BaseModel):
@@ -227,11 +229,14 @@ class ModelConfig(BaseConfigModel):
             "or auto-calculate."
         ),
     )
-    cache_mode: Optional[CACHE_SIZES] = Field(
+    # TODO: Separate validation for Exl2 and Exl3 q-cache options
+    cache_mode: Optional[CACHE_TYPE] = Field(
         "FP16",
         description=(
             "Enable different cache modes for VRAM savings (default: FP16).\n"
-            f"Possible values: {str(CACHE_SIZES)[15:-1]}."
+            f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n"
+            "For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits "
+            "are integers from 2-8 (e.g. 8,8)."
         ),
     )
     cache_size: Optional[int] = Field(
diff --git a/config_sample.yml b/config_sample.yml
index a743c2c..045db51 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -114,7 +114,8 @@ model:
   rope_alpha:

   # Enable different cache modes for VRAM savings (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (e.g. 8,8).
   cache_mode: FP16

   # Size of the prompt cache to allocate (default: max_seq_len).
@@ -164,7 +165,8 @@ draft_model:
   draft_rope_alpha:

   # Cache mode for draft models to save VRAM (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (e.g. 8,8).
   draft_cache_mode: FP16

   # An integer array of GBs of VRAM to split between GPUs (default: []).
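
For reference, here is a minimal standalone sketch (not part of the patch) of how the new cache_mode string is interpreted. The helper name parse_cache_mode is hypothetical; the real logic lives inline in ExllamaV3Container.__init__ above, where the parsed bit widths select CacheLayer_quant and everything else falls back to CacheLayer_fp16.

import re

# Hypothetical helper, for illustration only: mirrors the alias-then-regex
# flow used in ExllamaV3Container.__init__ in this patch.
def parse_cache_mode(cache_mode: str) -> tuple[int, int] | None:
    """Return (k_bits, v_bits) for a quantized cache, or None for FP16."""

    # Legacy Exl2-style options map to symmetric k,v pairs
    aliases = {"Q4": "4,4", "Q6": "6,6", "Q8": "8,8"}
    cache_mode = aliases.get(cache_mode, cache_mode)

    # Accept "k,v" where both bit widths are single digits from 2-8
    split_cache_mode = re.search(r"^([2-8]),([2-8])$", cache_mode)
    if split_cache_mode:
        return int(split_cache_mode.group(1)), int(split_cache_mode.group(2))

    # Anything else ("FP16" after config validation) keeps full precision
    return None

assert parse_cache_mode("Q4") == (4, 4)    # legacy alias
assert parse_cache_mode("6,8") == (6, 8)   # asymmetric k/v bits
assert parse_cache_mode("FP16") is None    # full-precision default

Note that the constr pattern in CACHE_TYPE and the regex in the backend both accept only single-digit bit widths from 2 to 8, so malformed values such as 10,10 or 4;4 are rejected when the config is validated rather than silently falling back to FP16.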