Model: Check for unsupported cache mode in exllamav2
This commit is contained in:
parent
45b966363e
commit
9dcde59c57
2 changed files with 18 additions and 2 deletions
|
|
@ -185,9 +185,17 @@ class ExllamaV2Container(BaseModelContainer):
|
|||
# MARK: User configuration
|
||||
|
||||
# Get cache mode
# TODO: Separate validation for Exl2 and Exl3 q-cache options
self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")

# Catch exllamav3-style cache_mode values that exllamav2 doesn't support.
# Valid exllamav2 choices are the unquantized default "FP16" and the
# quantized modes, which all start with "Q" (Q4/Q6/Q8). The previous
# check rejected anything not starting with "Q", which spuriously
# warned on the valid (and default) "FP16" value.
if self.cache_mode != "FP16" and not self.cache_mode.startswith("Q"):
    logger.warning(
        f"Provided cache mode '{self.cache_mode}' is not a "
        "valid choice for exllamav2, please check your settings. "
        "Defaulting to FP16."
    )
    self.cache_mode = "FP16"
|
||||
|
||||
# Turn off GPU split if the user is using 1 GPU
|
||||
gpu_count = torch.cuda.device_count()
|
||||
gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
|
||||
|
|
@ -393,6 +401,15 @@ class ExllamaV2Container(BaseModelContainer):
|
|||
# Set draft cache mode
self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")

# Catch exllamav3-style draft_cache_mode values that exllamav2 doesn't
# support. "FP16" is the unquantized default and must be accepted;
# quantized exllamav2 modes start with "Q" (Q4/Q6/Q8). Without the
# explicit FP16 check, the default value itself triggered the warning.
if self.draft_cache_mode != "FP16" and not self.draft_cache_mode.startswith("Q"):
    logger.warning(
        f"Provided draft cache mode '{self.draft_cache_mode}' is not a "
        "valid choice for exllamav2, please check your settings. "
        "Defaulting to FP16."
    )
    self.draft_cache_mode = "FP16"
|
||||
|
||||
# Edit the draft config size
|
||||
if chunk_size:
|
||||
self.draft_config.max_input_len = chunk_size
|
||||
|
|
|
|||
|
|
@ -229,7 +229,6 @@ class ModelConfig(BaseConfigModel):
|
|||
"or auto-calculate."
|
||||
),
|
||||
)
|
||||
# TODO: Separate validation for Exl2 and Exl3 q-cache options
|
||||
cache_mode: Optional[CACHE_TYPE] = Field(
|
||||
"FP16",
|
||||
description=(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue