From f8070e77077f360631179c896808e6fb4658297f Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Tue, 6 May 2025 18:51:58 -0700
Subject: [PATCH 1/3] Model: Auto detect model backend from config

* Use exllamav3 for exl3 models, exllamav2 otherwise
---
 backends/exllamav3/model.py            |  6 ++----
 common/config_models.py                |  2 +-
 common/model.py                        | 19 ++++++++++++++++++-
 common/transformers_utils.py           | 12 +++++++++++-
 config_sample.yml                      |  4 ++--
 endpoints/OAI/types/chat_completion.py |  2 +-
 6 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index fc10a3d..f04c218 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -161,9 +161,7 @@ class ExllamaV3Container(BaseModelContainer):
             self.draft_model_dir = draft_model_path
             self.draft_config = Config.from_directory(str(draft_model_path.resolve()))
             self.draft_model = Model.from_config(self.draft_config)
-            logger.info(
-                f'Using draft model: {str(draft_model_path.resolve())}'
-            )
+            logger.info(f"Using draft model: {str(draft_model_path.resolve())}")
         else:
             self.draft_model = None
             self.draft_cache = None
@@ -223,7 +221,7 @@ class ExllamaV3Container(BaseModelContainer):
 
         # Draft cache
         if self.use_draft_model:
-            self.draft_cache = Cache(self.draft_model, max_num_tokens = self.cache_size)
+            self.draft_cache = Cache(self.draft_model, max_num_tokens=self.cache_size)
 
         # Max batch size
         self.max_batch_size = unwrap(kwargs.get("max_batch_size"), 256)
diff --git a/common/config_models.py b/common/config_models.py
index 9d88eb1..de1f803 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -166,7 +166,7 @@ class ModelConfig(BaseConfigModel):
     backend: Optional[str] = Field(
         None,
         description=(
-            "Backend to use for this model (default: exllamav2)\n"
+            "Backend to use for this model (auto-detect if not specified)\n"
             "Options: exllamav2, exllamav3"
         ),
     )
diff --git a/common/model.py b/common/model.py
index 2c7bd65..161c7dc 100644
--- a/common/model.py
+++ b/common/model.py
@@ -17,6 +17,7 @@ from common.logger import get_loading_progress_bar
 from common.networking import handle_request_error
 from common.tabby_config import config
 from common.optional_dependencies import dependencies
+from common.transformers_utils import HuggingFaceConfig
 from common.utils import unwrap
 
 # Global variables for model container
@@ -123,8 +124,24 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
     kwargs = {**config.model_defaults, **kwargs}
     kwargs = await apply_inline_overrides(model_path, **kwargs)
 
+    # Read config.json and detect the quant method
+    hf_config_path = model_path / "config.json"
+    if hf_config_path.exists():
+        try:
+            hf_config = await HuggingFaceConfig.from_file(model_path)
+        except Exception as exc:
+            raise ValueError(
+                "Failed to read the model's config.json. "
+                f"Please check your model directory at {model_path}."
+            ) from exc
+        quant_method = hf_config.quant_method()
+        if quant_method == "exl3":
+            backend_name = "exllamav3"
+        else:
+            backend_name = "exllamav2"
+
     # Create a new container and check if the right dependencies are installed
-    backend_name = unwrap(kwargs.get("backend"), "exllamav2").lower()
+    backend_name = unwrap(kwargs.get("backend"), backend_name).lower()
     container_class = _BACKEND_REGISTRY.get(backend_name)
 
     if not container_class:
diff --git a/common/transformers_utils.py b/common/transformers_utils.py
index 045312c..cd79f00 100644
--- a/common/transformers_utils.py
+++ b/common/transformers_utils.py
@@ -1,7 +1,7 @@
 import aiofiles
 import json
 import pathlib
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union
 
 from pydantic import BaseModel
 
@@ -42,6 +42,8 @@ class HuggingFaceConfig(BaseModel):
     Will be expanded as needed.
     """
 
+    quantization_config: Optional[Dict] = None
+
     @classmethod
     async def from_file(cls, model_directory: pathlib.Path):
         """Create an instance from a generation config file."""
@@ -54,6 +56,14 @@ class HuggingFaceConfig(BaseModel):
             hf_config_dict = json.loads(contents)
             return cls.model_validate(hf_config_dict)
 
+    def quant_method(self):
+        """Wrapper method to fetch quant type"""
+
+        if isinstance(self.quantization_config, Dict):
+            return self.quantization_config.get("quant_method")
+        else:
+            return None
+
 
 class TokenizerConfig(BaseModel):
     """
diff --git a/config_sample.yml b/config_sample.yml
index a743c2c..025d2ee 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -74,9 +74,9 @@ model:
   # Example: ['max_seq_len', 'cache_mode'].
   use_as_default: []
 
-  # Backend to use for the model (default: exllamav2)
+  # Backend to use for this model (auto-detect if not specified)
   # Options: exllamav2, exllamav3
-  backend: exllamav2
+  backend:
 
   # Max sequence length (default: Empty).
   # Fetched from the model's base sequence length in config.json by default.
diff --git a/endpoints/OAI/types/chat_completion.py b/endpoints/OAI/types/chat_completion.py
index 51695c2..fb73eb9 100644
--- a/endpoints/OAI/types/chat_completion.py
+++ b/endpoints/OAI/types/chat_completion.py
@@ -84,7 +84,7 @@ class ChatCompletionRequest(CommonCompletionRequest):
 
     # Chat completions requests do not have a BOS token preference. Backend
     # respects the tokenization config for the individual model.
-    add_bos_token: Optional[bool] = Field(default = None)
+    add_bos_token: Optional[bool] = Field(default=None)
 
     @field_validator("add_bos_token", mode="after")
     def force_bos_token(cls, v):

From cfee16905bf63d7a6fa2bb33f12c4e878ab251d6 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Thu, 8 May 2025 23:42:39 -0400
Subject: [PATCH 2/3] Model: Migrate backend detection to a separate function

The detection logic seemed out of place in the common load function.
In addition, rename the transformers utils method, since it actually
takes a directory instead of a file.
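
For reviewers, a rough usage sketch of the new helper (the model path
below is hypothetical; detection only inspects the "quant_method" key
of config.json's "quantization_config"):

    import asyncio
    import pathlib

    from common.model import detect_backend

    async def main():
        # Hypothetical directory whose config.json contains
        # {"quantization_config": {"quant_method": "exl3"}}
        model_dir = pathlib.Path("/models/my-exl3-model")
        print(await detect_backend(model_dir))  # -> "exllamav3"

    asyncio.run(main())

Any other quant_method value, or a missing quantization_config, falls
back to "exllamav2".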
Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 common/model.py              | 39 ++++++++++++++++++++----------------
 common/transformers_utils.py |  2 +-
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/common/model.py b/common/model.py
index 161c7dc..67af212 100644
--- a/common/model.py
+++ b/common/model.py
@@ -57,6 +57,24 @@ def load_progress(module, modules):
         yield module, modules
 
 
+async def detect_backend(model_path: pathlib.Path) -> str:
+    """Determine the appropriate backend based on model files and configuration."""
+
+    try:
+        hf_config = await HuggingFaceConfig.from_directory(model_path)
+        quant_method = hf_config.quant_method()
+
+        if quant_method == "exl3":
+            return "exllamav3"
+        else:
+            return "exllamav2"
+    except Exception as exc:
+        raise ValueError(
+            "Failed to read the model's config.json. "
+            f"Please check your model directory at {model_path}."
+        ) from exc
+
+
 async def apply_inline_overrides(model_dir: pathlib.Path, **kwargs):
     """Sets overrides from a model folder's config yaml."""
 
@@ -124,24 +142,11 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
     kwargs = {**config.model_defaults, **kwargs}
     kwargs = await apply_inline_overrides(model_path, **kwargs)
 
-    # Read config.json and detect the quant method
-    hf_config_path = model_path / "config.json"
-    if hf_config_path.exists():
-        try:
-            hf_config = await HuggingFaceConfig.from_file(model_path)
-        except Exception as exc:
-            raise ValueError(
-                "Failed to read the model's config.json. "
-                f"Please check your model directory at {model_path}."
-            ) from exc
-        quant_method = hf_config.quant_method()
-        if quant_method == "exl3":
-            backend_name = "exllamav3"
-        else:
-            backend_name = "exllamav2"
-
     # Create a new container and check if the right dependencies are installed
-    backend_name = unwrap(kwargs.get("backend"), backend_name).lower()
+    backend_name = unwrap(
+        kwargs.get("backend"), await detect_backend(model_path)
+    ).lower()
+    print(backend_name)
     container_class = _BACKEND_REGISTRY.get(backend_name)
 
     if not container_class:
diff --git a/common/transformers_utils.py b/common/transformers_utils.py
index cd79f00..d1e5ac1 100644
--- a/common/transformers_utils.py
+++ b/common/transformers_utils.py
@@ -45,7 +45,7 @@ class HuggingFaceConfig(BaseModel):
     quantization_config: Optional[Dict] = None
 
     @classmethod
-    async def from_file(cls, model_directory: pathlib.Path):
+    async def from_directory(cls, model_directory: pathlib.Path):
         """Create an instance from a generation config file."""
 
         hf_config_path = model_directory / "config.json"

From d5963007f026c959d5d3a8ecc86d55b02b4d07b4 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Thu, 8 May 2025 23:45:04 -0400
Subject: [PATCH 3/3] Model: Add backend print

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 common/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/model.py b/common/model.py
index 67af212..9cdfdeb 100644
--- a/common/model.py
+++ b/common/model.py
@@ -146,7 +146,6 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
     backend_name = unwrap(
         kwargs.get("backend"), await detect_backend(model_path)
     ).lower()
-    print(backend_name)
     container_class = _BACKEND_REGISTRY.get(backend_name)
 
     if not container_class:
@@ -162,6 +161,7 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
             f"Available backends: {available_backends}"
         )
 
+    logger.info(f"Using backend {backend_name}")
     new_container: BaseModelContainer = await container_class.create(
         model_path.resolve(), **kwargs
     )
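
Reviewer note: a minimal sketch of the detection inputs, assuming the
remaining fields of the abridged HuggingFaceConfig model are optional
(the config dicts below are illustrative, not taken from a real model):

    from common.transformers_utils import HuggingFaceConfig

    # exl3 quants advertise themselves via quantization_config
    cfg = HuggingFaceConfig.model_validate(
        {"quantization_config": {"quant_method": "exl3"}}
    )
    assert cfg.quant_method() == "exl3"  # detect_backend -> "exllamav3"

    # Models without a quantization_config (e.g. exl2) return None,
    # so detect_backend() falls back to "exllamav2"
    cfg = HuggingFaceConfig.model_validate({})
    assert cfg.quant_method() is None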