diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 5d79e4f..22200ec 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -185,9 +185,17 @@ class ExllamaV2Container(BaseModelContainer):
         # MARK: User configuration
 
         # Get cache mode
-        # TODO: Separate validation for Exl2 and Exl3 q-cache options
         self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")
 
+        # Catch exllamav3 cache_mode values that aren't valid for exllamav2
+        if not self.cache_mode.startswith(("Q", "FP16")):
+            logger.warning(
+                f"Provided cache mode '{self.cache_mode}' is not a "
+                "valid choice for exllamav2. Please check your settings. "
+                "Defaulting to FP16."
+            )
+            self.cache_mode = "FP16"
+
         # Turn off GPU split if the user is using 1 GPU
         gpu_count = torch.cuda.device_count()
         gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
@@ -393,6 +401,15 @@ class ExllamaV2Container(BaseModelContainer):
         # Set draft cache mode
         self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")
 
+        # Catch exllamav3 draft_cache_mode values that aren't valid for exllamav2
+        if not self.draft_cache_mode.startswith(("Q", "FP16")):
+            logger.warning(
+                f"Provided draft cache mode '{self.draft_cache_mode}' is not a "
+                "valid choice for exllamav2. Please check your settings. "
+                "Defaulting to FP16."
+            )
+            self.draft_cache_mode = "FP16"
+
         # Edit the draft config size
         if chunk_size:
             self.draft_config.max_input_len = chunk_size
diff --git a/common/config_models.py b/common/config_models.py
index b4a245e..e228552 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -229,7 +229,6 @@ class ModelConfig(BaseConfigModel):
             "or auto-calculate."
         ),
     )
-    # TODO: Separate validation for Exl2 and Exl3 q-cache options
     cache_mode: Optional[CACHE_TYPE] = Field(
         "FP16",
         description=(