From 306fc7cd153185012dc758b6655b670309830bb5 Mon Sep 17 00:00:00 2001
From: randoentity <137087500+randoentity@users.noreply.github.com>
Date: Wed, 30 Apr 2025 09:43:33 +0200
Subject: [PATCH] fixup: autosplit reserve

this probably breaks v2 support
---
 backends/exllamav3/model.py | 5 +++--
 common/config_models.py     | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index 793ec35..f2dfd4f 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -49,7 +49,7 @@ class ExllamaV3Container(BaseModelContainer):
     config: Config
     gpu_split: List[float] | None = None
     gpu_split_auto: bool = True
-    autosplit_reserve: List[float] = [96 * 1024**2]
+    autosplit_reserve: List[float] = [96 / 1024]
     max_seq_len: int
 
     use_tp: bool = False
@@ -112,7 +112,7 @@ class ExllamaV3Container(BaseModelContainer):
 
         # Reserve VRAM for each GPU
         self.autosplit_reserve = [
-            int(math.ceil(value * 1024**2))
+            int(math.ceil(value / 1024))
            for value in autosplit_reserve_megabytes
         ]
         # TODO: speculative decoding
@@ -171,6 +171,7 @@ class ExllamaV3Container(BaseModelContainer):
     @torch.inference_mode()
     def load_model_sync(self, progress_callback=None):
         for value in self.model.load_gen(
+            reserve_per_device=self.autosplit_reserve,
             use_per_device=self.gpu_split,
             callback=progress_callback
         ):
diff --git a/common/config_models.py b/common/config_models.py
index bb09e57..d2af39e 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -193,7 +193,7 @@ class ModelConfig(BaseConfigModel):
             "Not parsed for single GPU users."
         ),
     )
-    autosplit_reserve: List[int] = Field(
+    autosplit_reserve: List[float] = Field(
         [96],
         description=(
             "Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).\n"
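
Note (not part of the patch): a minimal sketch of the unit change this patch makes, assuming that exllamav3's load_gen() takes reserve_per_device in gigabytes rather than bytes, which is inferred from the diff and not confirmed here. The variable names mirror the patch; the surrounding scaffolding is hypothetical.

# Sketch of the autosplit reserve conversion, assuming config values are in MB.
import math

autosplit_reserve_megabytes = [96]  # default from config_models.py: 96 MB on GPU 0

# Old (exllamav2-style) conversion: MB -> bytes
reserve_bytes = [int(math.ceil(value * 1024**2)) for value in autosplit_reserve_megabytes]
assert reserve_bytes == [100663296]  # 96 MiB expressed in bytes

# New conversion in this patch: MB -> GB, passed as reserve_per_device
# Note that int(math.ceil(96 / 1024)) rounds 0.09375 GB up to 1 GB per GPU.
reserve_gb = [int(math.ceil(value / 1024)) for value in autosplit_reserve_megabytes]
assert reserve_gb == [1]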