Merge branch 'exl3' into backend-detect

Commit 02a8d68e17 by Brian, 2025-05-08 23:50:33 -04:00, committed by GitHub
4 changed files with 68 additions and 8 deletions


@@ -187,6 +187,15 @@ class ExllamaV2Container(BaseModelContainer):
         # Get cache mode
         self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")
+
+        # Catch exllamav3 cache_mode
+        if not self.cache_mode.startswith(("FP", "Q")):
+            logger.warning(
+                f"Provided cache mode '{self.cache_mode}' is not a "
+                "valid choice for exllamav2, please check your settings. "
+                "Defaulting to FP16."
+            )
+            self.cache_mode = "FP16"
 
         # Turn off GPU split if the user is using 1 GPU
         gpu_count = torch.cuda.device_count()
         gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
@@ -392,6 +401,15 @@ class ExllamaV2Container(BaseModelContainer):
             # Set draft cache mode
             self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")
+
+            # Catch exllamav3 draft_cache_mode
+            if not self.draft_cache_mode.startswith(("FP", "Q")):
+                logger.warning(
+                    f"Provided draft cache mode '{self.draft_cache_mode}' is not a "
+                    "valid choice for exllamav2, please check your settings. "
+                    "Defaulting to FP16."
+                )
+                self.draft_cache_mode = "FP16"
 
             # Edit the draft config size
             if chunk_size:
                 self.draft_config.max_input_len = chunk_size
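
For reference, a minimal sketch of how this guard behaves; coerce_exl2_cache_mode is a hypothetical helper that mirrors the checks above and is not part of the codebase:

from loguru import logger

def coerce_exl2_cache_mode(cache_mode: str) -> str:
    """Mirror the guard above: anything that is not an FP* or Q* mode
    (e.g. an exllamav3-style "8,8" pair) falls back to FP16."""
    if not cache_mode.startswith(("FP", "Q")):
        logger.warning(
            f"Provided cache mode '{cache_mode}' is not a valid choice "
            "for exllamav2, please check your settings. Defaulting to FP16."
        )
        return "FP16"
    return cache_mode

assert coerce_exl2_cache_mode("Q4") == "Q4"      # valid exl2 quantized cache
assert coerce_exl2_cache_mode("8,8") == "FP16"   # exl3-style pair, rejected with a warning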


@@ -1,6 +1,7 @@
 import asyncio
 import gc
 import pathlib
+import re
 import traceback
 from typing import (
     Any,
@@ -19,6 +20,7 @@ from exllamav3 import (
     Model,
     Tokenizer,
 )
+from exllamav3.cache import CacheLayer_quant
 from loguru import logger
 
 from backends.base_model_container import BaseModelContainer
@@ -73,6 +75,8 @@ class ExllamaV3Container(BaseModelContainer):
     use_tp: bool = False
     max_seq_len: int = 4096
     cache_size: int = 4096
+    cache_mode: str = "FP16"
+    draft_cache_mode: str = "FP16"
     chunk_size: int = 2048
     max_batch_size: Optional[int] = None
@@ -164,7 +168,7 @@ class ExllamaV3Container(BaseModelContainer):
             logger.info(f"Using draft model: {str(draft_model_path.resolve())}")
         else:
             self.draft_model = None
-            self.craft_cache = None
+            self.draft_cache = None
 
         # Turn off GPU split if the user is using 1 GPU
         gpu_count = torch.cuda.device_count()
@@ -217,11 +221,16 @@ class ExllamaV3Container(BaseModelContainer):
         # Cache
         user_cache_size = unwrap(kwargs.get("cache_size"), self.max_seq_len)
         self.cache_size = self.adjust_cache_size(user_cache_size)
-        self.cache = Cache(self.model, max_num_tokens=self.cache_size)
+        self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")
+        self.cache = self.create_cache(self.cache_mode, self.model)
 
         # Draft cache
         if self.use_draft_model:
-            self.draft_cache = Cache(self.draft_model, max_num_tokens=self.cache_size)
+            # Set draft cache mode
+            self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")
+            self.draft_cache = self.create_cache(
+                self.draft_cache_mode, self.draft_model
+            )
 
         # Max batch size
         self.max_batch_size = unwrap(kwargs.get("max_batch_size"), 256)
@@ -300,6 +309,33 @@ class ExllamaV3Container(BaseModelContainer):
 
         return chunk_size
 
+    def create_cache(self, raw_cache_mode: str, model: Model):
+        # Cast exl2 types to exl3
+        match raw_cache_mode:
+            case "Q4":
+                raw_cache_mode = "4,4"
+            case "Q6":
+                raw_cache_mode = "6,6"
+            case "Q8":
+                raw_cache_mode = "8,8"
+
+        split_cache_mode = re.search(r"^([2-8])\s*,\s*([2-8])$", raw_cache_mode)
+        if split_cache_mode:
+            draft_k_bits = int(split_cache_mode.group(1))
+            draft_v_bits = int(split_cache_mode.group(2))
+
+            cache = Cache(
+                model,
+                max_num_tokens=self.cache_size,
+                layer_type=CacheLayer_quant,
+                k_bits=draft_k_bits,
+                v_bits=draft_v_bits,
+            )
+        else:
+            cache = Cache(model, max_num_tokens=self.cache_size)
+
+        return cache
+
     def model_info(self) -> ModelCard:
         """
         Returns a dictionary of the current model's configuration parameters.
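
A minimal sketch of the parsing step inside create_cache, pulled out into a hypothetical parse_cache_mode helper for illustration; the real method goes on to construct a Cache with CacheLayer_quant when a bit pair is found:

import re
from typing import Optional, Tuple

def parse_cache_mode(raw_cache_mode: str) -> Optional[Tuple[int, int]]:
    # Cast exl2-style aliases to exl3 "k_bits,v_bits" pairs
    aliases = {"Q4": "4,4", "Q6": "6,6", "Q8": "8,8"}
    raw_cache_mode = aliases.get(raw_cache_mode, raw_cache_mode)

    # Same pattern as create_cache: two integers from 2-8, comma-separated
    split_cache_mode = re.search(r"^([2-8])\s*,\s*([2-8])$", raw_cache_mode)
    if not split_cache_mode:
        return None  # e.g. "FP16" -> unquantized cache

    return int(split_cache_mode.group(1)), int(split_cache_mode.group(2))

assert parse_cache_mode("Q6") == (6, 6)     # exl2 alias maps to a symmetric pair
assert parse_cache_mode("4, 8") == (4, 8)   # k_bits=4, v_bits=8
assert parse_cache_mode("FP16") is None     # falls back to a plain Cache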
@@ -312,7 +348,7 @@ class ExllamaV3Container(BaseModelContainer):
             max_seq_len=self.max_seq_len,
             cache_size=self.cache_size,
             max_batch_size=self.max_batch_size,
-            # cache_mode=self.cache_mode,
+            cache_mode=self.cache_mode,
             chunk_size=self.chunk_size,
             use_vision=self.use_vision,
         )


@@ -1,6 +1,7 @@
 from pydantic import (
     BaseModel,
     ConfigDict,
+    constr,
     Field,
     PrivateAttr,
     field_validator,
@@ -9,6 +10,7 @@ from typing import List, Literal, Optional, Union
 
 CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"]
+CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8]\s*,\s*[2-8]$")]
 
 
 class Metadata(BaseModel):
@@ -225,11 +227,13 @@ class ModelConfig(BaseConfigModel):
             "or auto-calculate."
         ),
     )
-    cache_mode: Optional[CACHE_SIZES] = Field(
+    cache_mode: Optional[CACHE_TYPE] = Field(
         "FP16",
         description=(
             "Enable different cache modes for VRAM savings (default: FP16).\n"
-            f"Possible values: {str(CACHE_SIZES)[15:-1]}."
+            f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n"
+            "For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits "
+            "are integers from 2-8 (e.g. 8,8)."
         ),
     )
     cache_size: Optional[int] = Field(
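
To show what the new CACHE_TYPE annotation accepts, here is a small probe, assuming pydantic v2; CacheModeProbe is a hypothetical stand-in for the ModelConfig field:

from typing import Literal, Optional, Union

from pydantic import BaseModel, ValidationError, constr

CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"]
CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8]\s*,\s*[2-8]$")]

class CacheModeProbe(BaseModel):
    cache_mode: Optional[CACHE_TYPE] = "FP16"

for value in ["FP16", "Q4", "8,8", "4, 6", "9,9", "Q5"]:
    try:
        print(value, "->", CacheModeProbe(cache_mode=value).cache_mode)
    except ValidationError:
        print(value, "-> rejected")  # "9,9" and "Q5" match neither union member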


@@ -114,7 +114,8 @@ model:
   rope_alpha:
 
   # Enable different cache modes for VRAM savings (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (e.g. 8,8).
   cache_mode: FP16
 
   # Size of the prompt cache to allocate (default: max_seq_len).
@@ -164,7 +165,8 @@ draft_model:
   draft_rope_alpha:
 
   # Cache mode for draft models to save VRAM (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (e.g. 8,8).
   draft_cache_mode: FP16
 
   # An integer array of GBs of VRAM to split between GPUs (default: []).
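
Finally, a short sketch of how a config using the exllamav3-style pair would be read; it assumes PyYAML is available, and the snippet is illustrative rather than taken from the repository:

import yaml  # PyYAML, assumed to be installed

snippet = """
model:
  cache_mode: "6,8"
draft_model:
  draft_cache_mode: FP16
"""

config = yaml.safe_load(snippet)
print(config["model"]["cache_mode"])              # "6,8"  -> quantized exl3 cache (k_bits=6, v_bits=8)
print(config["draft_model"]["draft_cache_mode"])  # "FP16" -> plain FP16 cache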