From 084916c04f90add14e0e59fa1bc0b1aa6c87f130 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Sat, 17 May 2025 00:51:14 -0400
Subject: [PATCH] Model: Fix autosplit reserve crash with GPU split

ExllamaV3 does not accept autosplit_reserve and gpu_split at the same time.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 backends/exllamav3/model.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index 52536fe..d80ee56 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -69,9 +69,9 @@ class ExllamaV3Container(BaseModelContainer):
     generator: Optional[AsyncGenerator] = None
 
     # Class-specific vars
-    gpu_split: List[float] | None = None
+    gpu_split: Optional[List[float]] = None
     gpu_split_auto: bool = True
-    autosplit_reserve: List[float] = [96 / 1024]
+    autosplit_reserve: Optional[List[float]] = [96 / 1024]
     use_tp: bool = False
     max_seq_len: int = 4096
     cache_size: int = 4096
@@ -155,6 +155,10 @@ class ExllamaV3Container(BaseModelContainer):
         if gpu_split:
             self.gpu_split = gpu_split
 
+            # Causes crash if set with GPU split
+            # TODO: Remove when fixed in exllama upstream
+            self.autosplit_reserve = None
+
             gpu_device_list = [
                 device_idx
                 for device_idx, memory in enumerate(self.gpu_split)
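
For context, a minimal standalone sketch of the guard this patch adds, assuming (per the commit message) that the ExllamaV3 loader errors out when handed both a manual gpu_split and an autosplit_reserve. ContainerSketch and set_gpu_split are hypothetical names for illustration only, not the actual backends/exllamav3/model.py API.

from typing import List, Optional


class ContainerSketch:
    # Mirrors the patched class attributes: a manual split is optional,
    # and the autosplit reserve defaults to roughly 96 MiB per device.
    gpu_split: Optional[List[float]] = None
    gpu_split_auto: bool = True
    autosplit_reserve: Optional[List[float]] = [96 / 1024]

    def set_gpu_split(self, gpu_split: Optional[List[float]]) -> None:
        if gpu_split:
            self.gpu_split = gpu_split
            # A manual split and an autosplit reserve cannot be forwarded
            # together, so the reserve is cleared whenever an explicit
            # split is supplied (the workaround this patch introduces).
            self.autosplit_reserve = None


container = ContainerSketch()
container.set_gpu_split([20.0, 24.0])  # e.g. per-GPU memory budgets in GB
assert container.autosplit_reserve is None

The sketch only illustrates the interaction; in the real container the cleared reserve simply means no autosplit headroom is passed to the loader when the user pins an explicit per-GPU split.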