fixup: autosplit reserve

this probably breaks v2 support
randoentity 2025-04-30 09:43:33 +02:00 committed by kingbri
parent acb3adb953
commit 306fc7cd15
2 changed files with 4 additions and 3 deletions

@@ -49,7 +49,7 @@ class ExllamaV3Container(BaseModelContainer):
     config: Config
     gpu_split: List[float] | None = None
     gpu_split_auto: bool = True
-    autosplit_reserve: List[float] = [96 * 1024**2]
+    autosplit_reserve: List[float] = [96 / 1024]
     max_seq_len: int
     use_tp: bool = False
@@ -112,7 +112,7 @@ class ExllamaV3Container(BaseModelContainer):
         # Reserve VRAM for each GPU
         self.autosplit_reserve = [
-            int(math.ceil(value * 1024**2))
+            int(math.ceil(value / 1024))
             for value in autosplit_reserve_megabytes
         ]
         # TODO: speculative decoding
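
In short: the reserve values from the config are given in megabytes; before this commit they were converted to bytes (value * 1024**2), and now they are divided by 1024, i.e. treated as gigabytes, which is presumably the unit exllamav3 expects for reserve_per_device. A minimal standalone sketch of the arithmetic (illustrative variable names, not the project's code):

    import math

    autosplit_reserve_megabytes = [96]  # config default: 96 MB reserved on GPU 0

    # Old conversion, MB -> bytes
    old_reserve = [int(math.ceil(value * 1024**2)) for value in autosplit_reserve_megabytes]

    # New conversion, MB -> GB; note that int(math.ceil(...)) rounds the
    # default 96 MB up to a whole gigabyte per device
    new_reserve = [int(math.ceil(value / 1024)) for value in autosplit_reserve_megabytes]

    print(old_reserve)  # [100663296]
    print(new_reserve)  # [1]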
@@ -171,6 +171,7 @@ class ExllamaV3Container(BaseModelContainer):
     @torch.inference_mode()
     def load_model_sync(self, progress_callback=None):
         for value in self.model.load_gen(
+            reserve_per_device=self.autosplit_reserve,
             use_per_device=self.gpu_split,
             callback=progress_callback
         ):
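
This hunk passes the reserve list to the loader. A rough sketch of the values the call would receive under the defaults, assuming reserve_per_device is interpreted in gigabytes (an assumption based on the /1024 conversion above, not on exllamav3 documentation) and that gpu_split stays None when autosplit is enabled:

    # Illustrative values only, not taken from the repository:
    autosplit_reserve = [1]   # int(math.ceil(96 / 1024)) for the default 96 MB on GPU 0
    gpu_split = None          # autosplit: let the loader decide the per-GPU split

    # for value in self.model.load_gen(
    #     reserve_per_device=autosplit_reserve,
    #     use_per_device=gpu_split,
    #     callback=progress_callback,
    # ):
    #     ...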

@@ -193,7 +193,7 @@ class ModelConfig(BaseConfigModel):
             "Not parsed for single GPU users."
         ),
     )
-    autosplit_reserve: List[int] = Field(
+    autosplit_reserve: List[float] = Field(
         [96],
         description=(
             "Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).\n"