From 7c6a05374786ee44f216feab337fc145723f5a08 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Sun, 27 Apr 2025 22:27:26 -0400
Subject: [PATCH] Model: Add option to select backend

Changing the backend switches the container that's used.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 common/config_models.py       |  7 +++++++
 common/model.py               | 28 +++++++++++++++++++++++-----
 config_sample.yml             |  4 ++++
 endpoints/core/types/model.py |  5 ++++-
 4 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/common/config_models.py b/common/config_models.py
index a31bd3e..bb09e57 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -163,6 +163,13 @@ class ModelConfig(BaseConfigModel):
             "Example: ['max_seq_len', 'cache_mode']."
         ),
     )
+    backend: Optional[str] = Field(
+        "exllamav2",
+        description=(
+            "Backend to use for this model (default: exllamav2)\n"
+            "Options: exllamav2, exllamav3"
+        ),
+    )
     max_seq_len: Optional[int] = Field(
         None,
         description=(
diff --git a/common/model.py b/common/model.py
index cc26b43..96d45f6 100644
--- a/common/model.py
+++ b/common/model.py
@@ -23,10 +23,14 @@ from common.utils import unwrap
 container: Optional[BaseModelContainer] = None
 embeddings_container = None
 
-# FIXME: Possibly use this solely when creating the model
+# Backend name -> container class; filled in below for each installed backend
+_BACKEND_REGISTRY = {}
+
 if dependencies.exllamav2:
     from backends.exllamav2.model import ExllamaV2Container
 
+    _BACKEND_REGISTRY["exllamav2"] = ExllamaV2Container
+
 
 if dependencies.extras:
     from backends.infinity.model import InfinityContainer
@@ -113,10 +117,24 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
     kwargs = {**config.model_defaults, **kwargs}
     kwargs = await apply_inline_overrides(model_path, **kwargs)
 
-    # Create a new container
-    new_container = await ExllamaV2Container.create(
-        model_path.resolve(), False, **kwargs
-    )
+    # Resolve the container class; only backends with installed deps are registered
+    backend_name = unwrap(kwargs.get("backend"), "exllamav2").lower()
+    container_class = _BACKEND_REGISTRY.get(backend_name)
+
+    if container_class is None:
+        # A documented backend missing from the registry lacks its dependencies
+        if backend_name in ("exllamav2", "exllamav3"):
+            raise ValueError(
+                f"Backend '{backend_name}' selected, but required dependencies "
+                "are not installed."
+            )
+
+        raise ValueError(
+            f"Invalid backend '{backend_name}'. "
+            f"Available backends: {list(_BACKEND_REGISTRY)}"
+        )
+
+    new_container = await container_class.create(model_path.resolve(), False, **kwargs)
 
     # Add possible types of models that can be loaded
     model_type = [ModelType.MODEL]
diff --git a/config_sample.yml b/config_sample.yml
index b6f362d..a743c2c 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -74,6 +74,10 @@ model:
   # Example: ['max_seq_len', 'cache_mode'].
   use_as_default: []
 
+  # Backend to use for the model (default: exllamav2)
+  # Options: exllamav2, exllamav3
+  backend: exllamav2
+
   # Max sequence length (default: Empty).
   # Fetched from the model's base sequence length in config.json by default.
   max_seq_len:
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py
index 02213f9..6855108 100644
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -81,7 +81,10 @@ class ModelLoadRequest(BaseModel):
     )
 
     # Config arguments
-
+    backend: Optional[str] = Field(
+        description="Backend to use",
+        default="exllamav2",
+    )
     max_seq_len: Optional[int] = Field(
         description="Leave this blank to use the model's base sequence length",
         default=None,