Merge branch 'exl3' into backend-detect

Commit 02a8d68e17 by Brian, 2025-05-08 23:50:33 -04:00, committed by GitHub
4 changed files with 68 additions and 8 deletions


@@ -187,6 +187,15 @@ class ExllamaV2Container(BaseModelContainer):
         # Get cache mode
         self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")
+
+        # Catch exllamav3 cache_mode
+        if not self.cache_mode.startswith(("FP", "Q")):
+            logger.warning(
+                f"Provided cache mode '{self.cache_mode}' is not a "
+                "valid choice for exllamav2, please check your settings. "
+                "Defaulting to FP16."
+            )
+            self.cache_mode = "FP16"
 
         # Turn off GPU split if the user is using 1 GPU
         gpu_count = torch.cuda.device_count()
         gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
@@ -392,6 +401,15 @@ class ExllamaV2Container(BaseModelContainer):
             # Set draft cache mode
             self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")
+
+            # Catch exllamav3 draft_cache_mode
+            if not self.draft_cache_mode.startswith(("FP", "Q")):
+                logger.warning(
+                    f"Provided draft cache mode '{self.draft_cache_mode}' is not a "
+                    "valid choice for exllamav2, please check your settings. "
+                    "Defaulting to FP16."
+                )
+                self.draft_cache_mode = "FP16"
 
             # Edit the draft config size
             if chunk_size:
                 self.draft_config.max_input_len = chunk_size
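
For reference, a minimal sketch of how this guard behaves; coerce_exl2_cache_mode is a hypothetical helper that mirrors the checks above and is not part of the codebase:

from loguru import logger

def coerce_exl2_cache_mode(cache_mode: str) -> str:
    """Mirror the guard above: anything that is not an FP* or Q* mode
    (e.g. an exllamav3-style "8,8" pair) falls back to FP16."""
    if not cache_mode.startswith(("FP", "Q")):
        logger.warning(
            f"Provided cache mode '{cache_mode}' is not a valid choice "
            "for exllamav2, please check your settings. Defaulting to FP16."
        )
        return "FP16"
    return cache_mode

assert coerce_exl2_cache_mode("Q4") == "Q4"      # valid exl2 quantized cache
assert coerce_exl2_cache_mode("8,8") == "FP16"   # exl3-style pair, rejected with a warning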


@@ -1,6 +1,7 @@
 import asyncio
 import gc
 import pathlib
+import re
 import traceback
 from typing import (
     Any,
@@ -19,6 +20,7 @@ from exllamav3 import (
     Model,
     Tokenizer,
 )
+from exllamav3.cache import CacheLayer_quant
 from loguru import logger
 
 from backends.base_model_container import BaseModelContainer
@@ -73,6 +75,8 @@ class ExllamaV3Container(BaseModelContainer):
     use_tp: bool = False
     max_seq_len: int = 4096
     cache_size: int = 4096
+    cache_mode: str = "FP16"
+    draft_cache_mode: str = "FP16"
     chunk_size: int = 2048
     max_batch_size: Optional[int] = None
@@ -164,7 +168,7 @@ class ExllamaV3Container(BaseModelContainer):
             logger.info(f"Using draft model: {str(draft_model_path.resolve())}")
         else:
             self.draft_model = None
-            self.craft_cache = None
+            self.draft_cache = None
 
         # Turn off GPU split if the user is using 1 GPU
         gpu_count = torch.cuda.device_count()
@@ -217,11 +221,16 @@ class ExllamaV3Container(BaseModelContainer):
         # Cache
         user_cache_size = unwrap(kwargs.get("cache_size"), self.max_seq_len)
         self.cache_size = self.adjust_cache_size(user_cache_size)
-        self.cache = Cache(self.model, max_num_tokens=self.cache_size)
+        self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")
+        self.cache = self.create_cache(self.cache_mode, self.model)
 
         # Draft cache
         if self.use_draft_model:
-            self.draft_cache = Cache(self.draft_model, max_num_tokens=self.cache_size)
+            # Set draft cache mode
+            self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")
+            self.draft_cache = self.create_cache(
+                self.draft_cache_mode, self.draft_model
+            )
 
         # Max batch size
         self.max_batch_size = unwrap(kwargs.get("max_batch_size"), 256)
@@ -300,6 +309,33 @@ class ExllamaV3Container(BaseModelContainer):
 
         return chunk_size
 
+    def create_cache(self, raw_cache_mode: str, model: Model):
+        # Cast exl2 types to exl3
+        match raw_cache_mode:
+            case "Q4":
+                raw_cache_mode = "4,4"
+            case "Q6":
+                raw_cache_mode = "6,6"
+            case "Q8":
+                raw_cache_mode = "8,8"
+
+        split_cache_mode = re.search(r"^([2-8])\s*,\s*([2-8])$", raw_cache_mode)
+        if split_cache_mode:
+            draft_k_bits = int(split_cache_mode.group(1))
+            draft_v_bits = int(split_cache_mode.group(2))
+
+            cache = Cache(
+                model,
+                max_num_tokens=self.cache_size,
+                layer_type=CacheLayer_quant,
+                k_bits=draft_k_bits,
+                v_bits=draft_v_bits,
+            )
+        else:
+            cache = Cache(model, max_num_tokens=self.cache_size)
+
+        return cache
+
     def model_info(self) -> ModelCard:
         """
         Returns a dictionary of the current model's configuration parameters.
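
A minimal sketch of the parsing step inside create_cache, pulled out into a hypothetical parse_cache_mode helper for illustration; the real method goes on to construct a Cache with CacheLayer_quant when a bit pair is found:

import re
from typing import Optional, Tuple

def parse_cache_mode(raw_cache_mode: str) -> Optional[Tuple[int, int]]:
    # Cast exl2-style aliases to exl3 "k_bits,v_bits" pairs
    aliases = {"Q4": "4,4", "Q6": "6,6", "Q8": "8,8"}
    raw_cache_mode = aliases.get(raw_cache_mode, raw_cache_mode)

    # Same pattern as create_cache: two integers from 2-8, comma-separated
    split_cache_mode = re.search(r"^([2-8])\s*,\s*([2-8])$", raw_cache_mode)
    if not split_cache_mode:
        return None  # e.g. "FP16" -> unquantized cache

    return int(split_cache_mode.group(1)), int(split_cache_mode.group(2))

assert parse_cache_mode("Q6") == (6, 6)     # exl2 alias maps to a symmetric pair
assert parse_cache_mode("4, 8") == (4, 8)   # k_bits=4, v_bits=8
assert parse_cache_mode("FP16") is None     # falls back to a plain Cache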
@@ -312,7 +348,7 @@ class ExllamaV3Container(BaseModelContainer):
             max_seq_len=self.max_seq_len,
             cache_size=self.cache_size,
             max_batch_size=self.max_batch_size,
-            # cache_mode=self.cache_mode,
+            cache_mode=self.cache_mode,
             chunk_size=self.chunk_size,
             use_vision=self.use_vision,
         )


@@ -1,6 +1,7 @@
 from pydantic import (
     BaseModel,
     ConfigDict,
+    constr,
     Field,
     PrivateAttr,
     field_validator,
@@ -9,6 +10,7 @@ from typing import List, Literal, Optional, Union
 
 CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"]
+CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8]\s*,\s*[2-8]$")]
 
 
 class Metadata(BaseModel):
@@ -225,11 +227,13 @@ class ModelConfig(BaseConfigModel):
             "or auto-calculate."
         ),
     )
-    cache_mode: Optional[CACHE_SIZES] = Field(
+    cache_mode: Optional[CACHE_TYPE] = Field(
         "FP16",
         description=(
             "Enable different cache modes for VRAM savings (default: FP16).\n"
-            f"Possible values: {str(CACHE_SIZES)[15:-1]}."
+            f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n"
+            "For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits "
+            "are integers from 2-8 (e.g. 8,8)."
         ),
     )
     cache_size: Optional[int] = Field(
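
To show what the new CACHE_TYPE annotation accepts, here is a small probe, assuming pydantic v2; CacheModeProbe is a hypothetical stand-in for the ModelConfig field:

from typing import Literal, Optional, Union

from pydantic import BaseModel, ValidationError, constr

CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"]
CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8]\s*,\s*[2-8]$")]

class CacheModeProbe(BaseModel):
    cache_mode: Optional[CACHE_TYPE] = "FP16"

for value in ["FP16", "Q4", "8,8", "4, 6", "9,9", "Q5"]:
    try:
        print(value, "->", CacheModeProbe(cache_mode=value).cache_mode)
    except ValidationError:
        print(value, "-> rejected")  # "9,9" and "Q5" match neither union member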


@@ -114,7 +114,8 @@ model:
   rope_alpha:
 
   # Enable different cache modes for VRAM savings (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (e.g. 8,8).
   cache_mode: FP16
 
   # Size of the prompt cache to allocate (default: max_seq_len).
@@ -164,7 +165,8 @@ draft_model:
   draft_rope_alpha:
 
   # Cache mode for draft models to save VRAM (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (e.g. 8,8).
   draft_cache_mode: FP16
 
   # An integer array of GBs of VRAM to split between GPUs (default: []).
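
Finally, a short sketch of how a config using the exllamav3-style pair would be read; it assumes PyYAML is available, and the snippet is illustrative rather than taken from the repository:

import yaml  # PyYAML, assumed to be installed

snippet = """
model:
  cache_mode: "6,8"
draft_model:
  draft_cache_mode: FP16
"""

config = yaml.safe_load(snippet)
print(config["model"]["cache_mode"])              # "6,8"  -> quantized exl3 cache (k_bits=6, v_bits=8)
print(config["draft_model"]["draft_cache_mode"])  # "FP16" -> plain FP16 cache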