From 084916c04f90add14e0e59fa1bc0b1aa6c87f130 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Sat, 17 May 2025 00:51:14 -0400
Subject: [PATCH] Model: Fix autosplit reserve crash with GPU split

ExllamaV3 does not accept autosplit_reserve and gpu_split at the same time.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 backends/exllamav3/model.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index 52536fe..d80ee56 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -69,9 +69,9 @@ class ExllamaV3Container(BaseModelContainer):
     generator: Optional[AsyncGenerator] = None
 
     # Class-specific vars
-    gpu_split: List[float] | None = None
+    gpu_split: Optional[List[float]] = None
     gpu_split_auto: bool = True
-    autosplit_reserve: List[float] = [96 / 1024]
+    autosplit_reserve: Optional[List[float]] = [96 / 1024]
     use_tp: bool = False
     max_seq_len: int = 4096
     cache_size: int = 4096
@@ -155,6 +155,10 @@ class ExllamaV3Container(BaseModelContainer):
         if gpu_split:
             self.gpu_split = gpu_split
 
+            # Causes crash if set with GPU split
+            # TODO: Remove when fixed in exllama upstream
+            self.autosplit_reserve = None
+
             gpu_device_list = [
                 device_idx
                 for device_idx, memory in enumerate(self.gpu_split)
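
For context, a minimal standalone sketch of the guard this patch adds, assuming (per the commit message) that the ExllamaV3 loader errors out when handed both a manual gpu_split and an autosplit_reserve. ContainerSketch and set_gpu_split are hypothetical names for illustration only, not the actual backends/exllamav3/model.py API.

from typing import List, Optional


class ContainerSketch:
    # Mirrors the patched class attributes: a manual split is optional,
    # and the autosplit reserve defaults to roughly 96 MiB per device.
    gpu_split: Optional[List[float]] = None
    gpu_split_auto: bool = True
    autosplit_reserve: Optional[List[float]] = [96 / 1024]

    def set_gpu_split(self, gpu_split: Optional[List[float]]) -> None:
        if gpu_split:
            self.gpu_split = gpu_split
            # A manual split and an autosplit reserve cannot be forwarded
            # together, so the reserve is cleared whenever an explicit
            # split is supplied (the workaround this patch introduces).
            self.autosplit_reserve = None


container = ContainerSketch()
container.set_gpu_split([20.0, 24.0])  # e.g. per-GPU memory budgets in GB
assert container.autosplit_reserve is None

The sketch only illustrates the interaction; in the real container the cleared reserve simply means no autosplit headroom is passed to the loader when the user pins an explicit per-GPU split.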