Merge pull request #329 from DocShotgun/exl3

Exllamav3 cache quantization
2025-05-08 23:11:45 -04:00 · 2025-05-08 23:11:45 -04:00 · 527afc206b
commit 527afc206b
parent bc0a84241a 638eef401a
5 changed files with 70 additions and 12 deletions
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -1,6 +1,7 @@
 from pydantic import (
    BaseModel,
    ConfigDict,
+    constr,
    Field,
    PrivateAttr,
    field_validator,
@@ -9,6 +10,7 @@ from typing import List, Literal, Optional, Union


 CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"]
+CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8]\s*,\s*[2-8]$")]


 class Metadata(BaseModel):
@@ -225,11 +227,13 @@ class ModelConfig(BaseConfigModel):
            "or auto-calculate."
        ),
    )
-    cache_mode: Optional[CACHE_SIZES] = Field(
+    cache_mode: Optional[CACHE_TYPE] = Field(
        "FP16",
        description=(
            "Enable different cache modes for VRAM savings (default: FP16).\n"
-            f"Possible values: {str(CACHE_SIZES)[15:-1]}."
+            f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n"
+            "For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits "
+            "are integers from 2-8 (i.e. 8,8)."
        ),
    )
    cache_size: Optional[int] = Field(