Model: Fix autosplit reserve crash with GPU split

ExllamaV3 does not accept autosplit_reserve and gpu_split at the same time.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
2025-05-17 00:51:14 -04:00 · 2025-05-17 00:51:14 -04:00 · 084916c04f
commit 084916c04f
parent 0858b6d4b2
1 changed files with 6 additions and 2 deletions
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@ -69,9 +69,9 @@ class ExllamaV3Container(BaseModelContainer):
    generator: Optional[AsyncGenerator] = None

    # Class-specific vars
-    gpu_split: List[float] | None = None
+    gpu_split: Optional[List[float]] = None
    gpu_split_auto: bool = True
-    autosplit_reserve: List[float] = [96 / 1024]
+    autosplit_reserve: Optional[List[float]] = [96 / 1024]
    use_tp: bool = False
    max_seq_len: int = 4096
    cache_size: int = 4096
@ -155,6 +155,10 @@ class ExllamaV3Container(BaseModelContainer):
            if gpu_split:
                self.gpu_split = gpu_split

+                # Causes crash if set with GPU split
+                # TODO: Remove when fixed in exllama upstream
+                self.autosplit_reserve = None
+
                gpu_device_list = [
                    device_idx
                    for device_idx, memory in enumerate(self.gpu_split)