Model: Check for unsupported cache mode in exllamav2

DocShotgun 2025-05-06 01:18:15 -07:00
parent 45b966363e
commit 9dcde59c57
2 changed files with 18 additions and 2 deletions

@@ -185,9 +185,17 @@ class ExllamaV2Container(BaseModelContainer):
        # MARK: User configuration

        # Get cache mode
        # TODO: Separate validation for Exl2 and Exl3 q-cache options
        self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")
        # Catch exllamav3 cache_mode
        if not self.cache_mode.startswith("Q"):
            logger.warning(
                f"Provided cache mode '{self.cache_mode}' is not a "
                "valid choice for exllamav2, please check your settings. "
                "Defaulting to FP16."
            )
            self.cache_mode = "FP16"

        # Turn off GPU split if the user is using 1 GPU
        gpu_count = torch.cuda.device_count()
        gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
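
To make the effect of the new check concrete, here is a minimal standalone sketch of the same fallback logic. The helper name, the logging setup, and the example values are illustrative only; the commit itself performs this check inline on the container.

# Minimal sketch of the fallback above (hypothetical helper, not the commit's code).
import logging

logger = logging.getLogger(__name__)


def resolve_exl2_cache_mode(requested: str) -> str:
    # exllamav2 quantized cache modes start with "Q" (e.g. Q4, Q6, Q8);
    # anything else falls back to FP16 with a warning.
    if not requested.startswith("Q"):
        logger.warning(
            f"Provided cache mode '{requested}' is not a valid choice "
            "for exllamav2, please check your settings. Defaulting to FP16."
        )
        return "FP16"
    return requested


print(resolve_exl2_cache_mode("Q6"))         # passes through unchanged
print(resolve_exl2_cache_mode("some_mode"))  # warns and returns "FP16"
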
@@ -393,6 +401,15 @@ class ExllamaV2Container(BaseModelContainer):
        # Set draft cache mode
        self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")

        # Catch exllamav3 draft_cache_mode
        if not self.draft_cache_mode.startswith("Q"):
            logger.warning(
                f"Provided draft cache mode '{self.draft_cache_mode}' is not a "
                "valid choice for exllamav2, please check your settings. "
                "Defaulting to FP16."
            )
            self.draft_cache_mode = "FP16"

        # Edit the draft config size
        if chunk_size:
            self.draft_config.max_input_len = chunk_size
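
The draft hunk repeats the same check as the main cache-mode hunk above. If that duplication ever needed tidying, both paths could share one helper; the sketch below is a hypothetical follow-up, not part of this commit. It also skips the warning when the value is already "FP16", which the literal startswith("Q") test in the diff would otherwise flag; treat that exemption as an assumption about intent.

# Hypothetical shared helper (not in the commit): one validation path for both
# the main and draft cache modes.
import logging

logger = logging.getLogger(__name__)


def validated_exl2_cache_mode(mode: str, label: str = "cache mode") -> str:
    # FP16 is the unquantized default; quantized modes start with "Q".
    if mode != "FP16" and not mode.startswith("Q"):
        logger.warning(
            f"Provided {label} '{mode}' is not a valid choice for "
            "exllamav2, please check your settings. Defaulting to FP16."
        )
        return "FP16"
    return mode


# Possible call sites, mirroring the two hunks above:
# self.cache_mode = validated_exl2_cache_mode(self.cache_mode)
# self.draft_cache_mode = validated_exl2_cache_mode(
#     self.draft_cache_mode, "draft cache mode"
# )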

@@ -229,7 +229,6 @@ class ModelConfig(BaseConfigModel):
            "or auto-calculate."
        ),
    )
    # TODO: Separate validation for Exl2 and Exl3 q-cache options
    cache_mode: Optional[CACHE_TYPE] = Field(
        "FP16",
        description=(