From 7c6a05374786ee44f216feab337fc145723f5a08 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Sun, 27 Apr 2025 22:27:26 -0400
Subject: [PATCH] Model: Add option to select backend

Changing the backend switches the container that's used.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 common/config_models.py       |  7 +++++++
 common/model.py               | 28 +++++++++++++++++++++++-----
 config_sample.yml             |  4 ++++
 endpoints/core/types/model.py |  5 ++++-
 4 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/common/config_models.py b/common/config_models.py
index a31bd3e..bb09e57 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -163,6 +163,13 @@ class ModelConfig(BaseConfigModel):
             "Example: ['max_seq_len', 'cache_mode']."
         ),
     )
+    backend: Optional[str] = Field(
+        "exllamav2",
+        description=(
+            "Backend to use for this model (default: exllamav2)\n"
+            "Options: exllamav2, exllamav3"
+        ),
+    )
     max_seq_len: Optional[int] = Field(
         None,
         description=(
diff --git a/common/model.py b/common/model.py
index cc26b43..96d45f6 100644
--- a/common/model.py
+++ b/common/model.py
@@ -23,10 +23,14 @@ from common.utils import unwrap
 container: Optional[BaseModelContainer] = None
 embeddings_container = None
 
-# FIXME: Possibly use this solely when creating the model
+# Maps backend names to container classes; filled only for installed backends
+_BACKEND_REGISTRY = {}
+
 if dependencies.exllamav2:
     from backends.exllamav2.model import ExllamaV2Container
 
+    _BACKEND_REGISTRY["exllamav2"] = ExllamaV2Container
+
 
 if dependencies.extras:
     from backends.infinity.model import InfinityContainer
@@ -113,10 +117,24 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
     kwargs = {**config.model_defaults, **kwargs}
     kwargs = await apply_inline_overrides(model_path, **kwargs)
 
-    # Create a new container
-    new_container = await ExllamaV2Container.create(
-        model_path.resolve(), False, **kwargs
-    )
+    # Create a new container and check if the right dependencies are installed
+    backend_name = unwrap(kwargs.get("backend"), "exllamav2").lower()
+    container_class = _BACKEND_REGISTRY.get(backend_name)
+
+    if not container_class:
+        # A known backend missing from the registry has uninstalled dependencies
+        if backend_name in ("exllamav2", "exllamav3"):
+            raise ValueError(
+                f"Backend '{backend_name}' selected, but required dependencies "
+                "are not installed."
+            )
+        else:
+            raise ValueError(
+                f"Invalid backend '{backend_name}'. "
+                f"Available backends: {list(_BACKEND_REGISTRY)}"
+            )
+
+    new_container = await container_class.create(model_path.resolve(), False, **kwargs)
 
     # Add possible types of models that can be loaded
     model_type = [ModelType.MODEL]
diff --git a/config_sample.yml b/config_sample.yml
index b6f362d..a743c2c 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -74,6 +74,10 @@ model:
   # Example: ['max_seq_len', 'cache_mode'].
   use_as_default: []
 
+  # Backend to use for the model (default: exllamav2)
+  # Options: exllamav2, exllamav3
+  backend: exllamav2
+
   # Max sequence length (default: Empty).
   # Fetched from the model's base sequence length in config.json by default.
   max_seq_len:
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py
index 02213f9..6855108 100644
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -81,7 +81,10 @@ class ModelLoadRequest(BaseModel):
     )
 
     # Config arguments
-
+    backend: Optional[str] = Field(
+        description="Backend to use",
+        default="exllamav2",
+    )
     max_seq_len: Optional[int] = Field(
         description="Leave this blank to use the model's base sequence length",
         default=None,