From a0c16bba2ac35739a1881973068f905e5ffd6c63 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sun, 15 Jun 2025 16:51:42 +0200 Subject: [PATCH 1/6] Exl2: Fix banned_strings (move outside of assign_gen_params) --- backends/exllamav2/model.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 29caa34..fa96251 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -1001,7 +1001,6 @@ class ExllamaV2Container(BaseModelContainer): params: BaseSamplerRequest, gen_settings: ExLlamaV2Sampler.Settings, grammar_handler: ExLlamaV2Grammar, - banned_strings: List[str], ): # Apply settings gen_settings.temperature = params.temperature @@ -1109,16 +1108,6 @@ class ExllamaV2Container(BaseModelContainer): params.grammar_string, self.model, self.tokenizer ) - # Set banned strings - banned_strings = params.banned_strings - if banned_strings and len(grammar_handler.filters) > 0: - logger.warning( - "Disabling banned_strings because " - "they cannot be used with grammar filters." - ) - - banned_strings = [] - # Speculative Ngram self.generator.speculative_ngram = params.speculative_ngram @@ -1224,15 +1213,23 @@ class ExllamaV2Container(BaseModelContainer): prompts = [prompt] gen_settings = ExLlamaV2Sampler.Settings() grammar_handler = ExLlamaV2Grammar() - banned_strings = [] self.assign_gen_params( params, gen_settings, grammar_handler, - banned_strings, ) + # Set banned strings + banned_strings = params.banned_strings + if banned_strings and len(grammar_handler.filters) > 0: + logger.warning( + "Disabling banned_strings because " + "they cannot be used with grammar filters." + ) + + banned_strings = [] + # Set CFG scale and negative prompt cfg_scale = params.cfg_scale negative_prompt = None From d357f100d0bf8863d64ae1703d43aeca56fbc122 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sun, 15 Jun 2025 19:12:45 +0200 Subject: [PATCH 2/6] Dependencies: Bump ExllamaV3 --- backends/exllamav3/model.py | 2 +- pyproject.toml | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index 9e69074..24c08f3 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -99,7 +99,7 @@ class ExllamaV3Container(BaseModelContainer): self = cls() # Make sure ExllamaV3 is up to date - check_package_version("exllamav3", "0.0.3") + check_package_version("exllamav3", "0.0.4") logger.warning( "ExllamaV3 is currently in an alpha state. 
" diff --git a/pyproject.toml b/pyproject.toml index ca69823..6a3a6ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,14 +78,14 @@ cu121 = [ "exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu128.torch2.7.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'", # Exl3 - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu128.torch2.7.0-cp313-cp313-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.13'", - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu128.torch2.7.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'", - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu128.torch2.7.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'", - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu128.torch2.7.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'", - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu128.torch2.7.0-cp313-cp313-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'", - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu128.torch2.7.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'", - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'", - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu128.torch2.7.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'", + "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp313-cp313-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.13'", + "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'", + "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'", + "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'", + "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp313-cp313-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'", + "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'", + "exllamav3 @ 
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'", + "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'", # Windows FA2 from https://github.com/kingbri1/flash-attention/releases "flash_attn @ https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp313-cp313-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.13'", From 4605c0f6bd049941b19b4f29773bb6ce9b8cf80f Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sun, 15 Jun 2025 19:20:36 +0200 Subject: [PATCH 3/6] Common: Refactor get_image to common functions --- backends/exllamav2/vision.py | 51 +---------------------------------- common/image_util.py | 52 ++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 50 deletions(-) create mode 100644 common/image_util.py diff --git a/backends/exllamav2/vision.py b/backends/exllamav2/vision.py index 7db0b09..8432eb5 100644 --- a/backends/exllamav2/vision.py +++ b/backends/exllamav2/vision.py @@ -1,19 +1,10 @@ """Vision utilities for ExLlamaV2.""" -import aiohttp -import base64 -import io -import re from async_lru import alru_cache -from fastapi import HTTPException -from PIL import Image from common import model -from common.networking import ( - handle_request_error, -) from common.optional_dependencies import dependencies -from common.tabby_config import config +from common.image_util import get_image # Since this is used outside the Exl2 backend, the dependency # may be optional @@ -21,46 +12,6 @@ if dependencies.exllamav2: from exllamav2.generator import ExLlamaV2MMEmbedding -async def get_image(url: str) -> Image: - if url.startswith("data:image"): - # Handle base64 image - match = re.match(r"^data:image\/[a-zA-Z0-9]+;base64,(.*)$", url) - if match: - base64_image = match.group(1) - bytes_image = base64.b64decode(base64_image) - else: - error_message = handle_request_error( - "Failed to read base64 image input.", - exc_info=False, - ).error.message - - raise HTTPException(400, error_message) - - else: - # Handle image URL - if config.network.disable_fetch_requests: - error_message = handle_request_error( - f"Failed to fetch image from {url} as fetch requests are disabled.", - exc_info=False, - ).error.message - - raise HTTPException(400, error_message) - - async with aiohttp.ClientSession() as session: - async with session.get(url) as response: - if response.status == 200: - bytes_image = await response.read() - else: - error_message = handle_request_error( - f"Failed to fetch image from {url}.", - exc_info=False, - ).error.message - - raise HTTPException(400, error_message) - - return Image.open(io.BytesIO(bytes_image)) - - # Fetch the return type on runtime @alru_cache(20) async def get_image_embedding(url: str) -> "ExLlamaV2MMEmbedding": diff --git a/common/image_util.py b/common/image_util.py new file mode 100644 index 0000000..9790cfe --- /dev/null +++ b/common/image_util.py @@ -0,0 +1,52 @@ +import aiohttp +import base64 +import io +import re + +from fastapi import HTTPException +from PIL import Image + +from common.networking import ( + handle_request_error, +) +from common.tabby_config 
import config + + +async def get_image(url: str) -> Image: + if url.startswith("data:image"): + # Handle base64 image + match = re.match(r"^data:image\/[a-zA-Z0-9]+;base64,(.*)$", url) + if match: + base64_image = match.group(1) + bytes_image = base64.b64decode(base64_image) + else: + error_message = handle_request_error( + "Failed to read base64 image input.", + exc_info=False, + ).error.message + + raise HTTPException(400, error_message) + + else: + # Handle image URL + if config.network.disable_fetch_requests: + error_message = handle_request_error( + f"Failed to fetch image from {url} as fetch requests are disabled.", + exc_info=False, + ).error.message + + raise HTTPException(400, error_message) + + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + if response.status == 200: + bytes_image = await response.read() + else: + error_message = handle_request_error( + f"Failed to fetch image from {url}.", + exc_info=False, + ).error.message + + raise HTTPException(400, error_message) + + return Image.open(io.BytesIO(bytes_image)) From 1c9891bf04f1128b1b6526ad8d035905050cbe1a Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sun, 15 Jun 2025 19:22:51 +0200 Subject: [PATCH 4/6] Exl3: Add vision capability --- backends/exllamav2/vision.py | 4 ++-- backends/exllamav3/model.py | 31 +++++++++++++++++++++++++++++++ backends/exllamav3/vision.py | 27 +++++++++++++++++++++++++++ common/multimodal.py | 25 ++++++++++++++++++++----- 4 files changed, 80 insertions(+), 7 deletions(-) create mode 100644 backends/exllamav3/vision.py diff --git a/backends/exllamav2/vision.py b/backends/exllamav2/vision.py index 8432eb5..90106e3 100644 --- a/backends/exllamav2/vision.py +++ b/backends/exllamav2/vision.py @@ -14,7 +14,7 @@ if dependencies.exllamav2: # Fetch the return type on runtime @alru_cache(20) -async def get_image_embedding(url: str) -> "ExLlamaV2MMEmbedding": +async def get_image_embedding_exl2(url: str) -> "ExLlamaV2MMEmbedding": image = await get_image(url) return model.container.vision_model.get_image_embeddings( model=model.container.model, @@ -25,4 +25,4 @@ async def get_image_embedding(url: str) -> "ExLlamaV2MMEmbedding": def clear_image_embedding_cache(): - get_image_embedding.cache_clear() + get_image_embedding_exl2.cache_clear() diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index 24c08f3..c8761ff 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -69,6 +69,7 @@ class ExllamaV3Container(BaseModelContainer): config: Optional[Config] = None draft_config: Optional[Config] = None generator: Optional[AsyncGenerator] = None + vision_model: Optional[Model] = None # Class-specific vars gpu_split: Optional[List[float]] = None @@ -112,6 +113,19 @@ class ExllamaV3Container(BaseModelContainer): self.model = Model.from_config(self.config) self.tokenizer = Tokenizer.from_config(self.config) + # Prepare vision model if requested in config + self.use_vision = kwargs.get("vision") + if self.use_vision and "vision" in self.config.model_classes: + self.vision_model = Model.from_config(self.config, component="vision") + else: + logger.warning( + "The provided model does not have vision capabilities that are " + "supported by ExllamaV3. " + "Vision input is disabled." 
+ ) + self.vision_model = None + self.use_vision = False + # Fallback to 4096 since exl3 can't fetch from HF's config.json self.max_seq_len = unwrap(kwargs.get("max_seq_len"), 4096) @@ -418,6 +432,14 @@ class ExllamaV3Container(BaseModelContainer): @torch.inference_mode() def load_model_sync(self, progress_callback=None): + if self.use_vision: + for value in self.vision_model.load_gen( + reserve_per_device=self.autosplit_reserve, + callback=progress_callback + ): + if value: + yield value + if self.use_draft_model: for value in self.draft_model.load_gen( reserve_per_device=self.autosplit_reserve, @@ -527,6 +549,9 @@ class ExllamaV3Container(BaseModelContainer): A list of integer token IDs. """ + mm_embeddings: MultimodalEmbeddingWrapper = kwargs.get("embeddings") + mm_embeddings_content = mm_embeddings.content if mm_embeddings else [] + return ( self.tokenizer.encode( text, @@ -534,6 +559,7 @@ class ExllamaV3Container(BaseModelContainer): kwargs.get("add_bos_token"), self.hf_model.add_bos_token() ), encode_special_tokens=unwrap(kwargs.get("encode_special_tokens"), True), + embeddings=mm_embeddings_content ) .flatten() .tolist() @@ -802,6 +828,9 @@ class ExllamaV3Container(BaseModelContainer): stop_conditions = params.stop add_bos_token = unwrap(params.add_bos_token, self.hf_model.add_bos_token()) + # Get multimodal embeddings if present + mm_embeddings_content = mm_embeddings.content if mm_embeddings else [] + # Fetch EOS tokens from generation_config if they exist eos_tokens = self.hf_model.eos_tokens() or [self.tokenizer.eos_token_id] @@ -812,6 +841,7 @@ class ExllamaV3Container(BaseModelContainer): prompt, add_bos=add_bos_token, encode_special_tokens=True, + embeddings=mm_embeddings_content, ) for prompt in prompts ] @@ -855,6 +885,7 @@ class ExllamaV3Container(BaseModelContainer): max_new_tokens=max_tokens, stop_conditions=stop_conditions, banned_strings=params.banned_strings, + embeddings=mm_embeddings_content, ) generated_tokens = 0 diff --git a/backends/exllamav3/vision.py b/backends/exllamav3/vision.py new file mode 100644 index 0000000..c59d881 --- /dev/null +++ b/backends/exllamav3/vision.py @@ -0,0 +1,27 @@ +"""Vision utilities for ExLlamaV2.""" + +from async_lru import alru_cache + +from common import model +from common.optional_dependencies import dependencies +from common.image_util import get_image + +# Since this is used outside the Exl3 backend, the dependency +# may be optional +if dependencies.exllamav3: + from exllamav3.tokenizer import MMEmbedding + + +# Fetch the return type on runtime +@alru_cache(20) +async def get_image_embedding_exl3(url: str) -> "MMEmbedding": + image = await get_image(url) + return model.container.vision_model.get_image_embeddings( + tokenizer=model.container.tokenizer, + image=image, + text_alias=None, + ) + + +def clear_image_embedding_cache(): + get_image_embedding_exl3.cache_clear() diff --git a/common/multimodal.py b/common/multimodal.py index dee865f..8b21587 100644 --- a/common/multimodal.py +++ b/common/multimodal.py @@ -1,4 +1,5 @@ -from backends.exllamav2.vision import get_image_embedding +from backends.exllamav2.vision import get_image_embedding_exl2 +from backends.exllamav3.vision import get_image_embedding_exl3 from common import model from loguru import logger from pydantic import BaseModel, Field @@ -8,7 +9,8 @@ from common.optional_dependencies import dependencies if dependencies.exllamav2: from exllamav2 import ExLlamaV2VisionTower - +if dependencies.exllamav3: + from exllamav3 import Model class 
MultimodalEmbeddingWrapper(BaseModel): """Common multimodal embedding wrapper""" @@ -20,12 +22,25 @@ class MultimodalEmbeddingWrapper(BaseModel): async def add(self, url: str): # Determine the type of vision embedding to use if not self.type: - if isinstance(model.container.vision_model, ExLlamaV2VisionTower): + if ( + dependencies.exllamav2 and + isinstance(model.container.vision_model, ExLlamaV2VisionTower) + ): self.type = "ExLlamaV2MMEmbedding" + elif ( + dependencies.exllamav3 and + isinstance(model.container.vision_model, Model) + ): + self.type = "MMEmbedding" + # Create the embedding if self.type == "ExLlamaV2MMEmbedding": - embedding = await get_image_embedding(url) + embedding = await get_image_embedding_exl2(url) + self.content.append(embedding) + self.text_alias.append(embedding.text_alias) + elif self.type == "MMEmbedding": + embedding = await get_image_embedding_exl3(url) self.content.append(embedding) self.text_alias.append(embedding.text_alias) else: - logger.error("No valid vision model to create embedding") + logger.error("No valid vision model to create embedding") \ No newline at end of file From 21c5af48e12644e1e9bb97aac10029bc7c2e57bf Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sun, 15 Jun 2025 19:30:38 +0200 Subject: [PATCH 5/6] Tree: Format --- backends/exllamav3/model.py | 4 ++-- common/multimodal.py | 12 +++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index c8761ff..ee2dfaa 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -435,7 +435,7 @@ class ExllamaV3Container(BaseModelContainer): if self.use_vision: for value in self.vision_model.load_gen( reserve_per_device=self.autosplit_reserve, - callback=progress_callback + callback=progress_callback, ): if value: yield value @@ -559,7 +559,7 @@ class ExllamaV3Container(BaseModelContainer): kwargs.get("add_bos_token"), self.hf_model.add_bos_token() ), encode_special_tokens=unwrap(kwargs.get("encode_special_tokens"), True), - embeddings=mm_embeddings_content + embeddings=mm_embeddings_content, ) .flatten() .tolist() diff --git a/common/multimodal.py b/common/multimodal.py index 8b21587..ec5fb6f 100644 --- a/common/multimodal.py +++ b/common/multimodal.py @@ -22,14 +22,12 @@ class MultimodalEmbeddingWrapper(BaseModel): async def add(self, url: str): # Determine the type of vision embedding to use if not self.type: - if ( - dependencies.exllamav2 and - isinstance(model.container.vision_model, ExLlamaV2VisionTower) + if dependencies.exllamav2 and isinstance( + model.container.vision_model, ExLlamaV2VisionTower ): self.type = "ExLlamaV2MMEmbedding" - elif ( - dependencies.exllamav3 and - isinstance(model.container.vision_model, Model) + elif dependencies.exllamav3 and isinstance( + model.container.vision_model, Model ): self.type = "MMEmbedding" @@ -43,4 +41,4 @@ class MultimodalEmbeddingWrapper(BaseModel): self.content.append(embedding) self.text_alias.append(embedding.text_alias) else: - logger.error("No valid vision model to create embedding") \ No newline at end of file + logger.error("No valid vision model to create embedding") From 122d87ac363b6004f2221bb1daf2533154edd42a Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sun, 15 Jun 2025 19:33:14 +0200 Subject: [PATCH 6/6] Tree: Format --- common/multimodal.py | 1 + 1 file changed, 1 insertion(+) diff --git a/common/multimodal.py b/common/multimodal.py index 
ec5fb6f..b92386f 100644
--- a/common/multimodal.py
+++ b/common/multimodal.py
@@ -12,6 +12,7 @@ if dependencies.exllamav2:
 
 if dependencies.exllamav3:
     from exllamav3 import Model
 
+
 class MultimodalEmbeddingWrapper(BaseModel):
     """Common multimodal embedding wrapper"""
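
---

Reviewer note: a minimal usage sketch of the multimodal path this series adds. It is illustrative only — the `build_vision_prompt` helper, the bare `MultimodalEmbeddingWrapper()` construction, and the prompt-assembly convention are assumptions, not part of the diffs; only `add()`, `.content`, and `.text_alias` come from `common/multimodal.py` above, and only the `embeddings=` keyword comes from the `backends/exllamav3/model.py` hunks.

```python
# Hypothetical sketch (not part of the patches): attaching images to a
# prompt once a vision-capable Exl2 or Exl3 model is loaded into
# model.container. Only MultimodalEmbeddingWrapper.add(), .content and
# .text_alias are taken from the diffs; everything else is illustrative.
from common.multimodal import MultimodalEmbeddingWrapper


async def build_vision_prompt(prompt: str, image_urls: list[str]):
    embeddings = MultimodalEmbeddingWrapper()

    for url in image_urls:
        # add() resolves the URL via common.image_util.get_image, then
        # dispatches to get_image_embedding_exl2 or get_image_embedding_exl3
        # depending on which backend's vision model is active.
        await embeddings.add(url)

    # The text aliases stand in for the images inside the prompt string,
    # while the raw embeddings travel separately (e.g. through the
    # "embeddings" kwarg that the exl3 container now forwards to
    # tokenizer.encode and generate).
    full_prompt = "".join(embeddings.text_alias) + "\n" + prompt
    return full_prompt, embeddings
```

The backend-specific names (`get_image_embedding_exl2` / `get_image_embedding_exl3`) also keep the `alru_cache` instances separate, since both backends now expose an LRU-cached embedding helper over the shared `get_image` utility.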