fixup: autosplit reserve

this probably breaks v2 support
randoentity 2025-04-30 09:43:33 +02:00 committed by kingbri
parent acb3adb953
commit 306fc7cd15
2 changed files with 4 additions and 3 deletions

@@ -49,7 +49,7 @@ class ExllamaV3Container(BaseModelContainer):
     config: Config
     gpu_split: List[float] | None = None
     gpu_split_auto: bool = True
-    autosplit_reserve: List[float] = [96 * 1024**2]
+    autosplit_reserve: List[float] = [96 / 1024]
     max_seq_len: int
     use_tp: bool = False
@@ -112,7 +112,7 @@ class ExllamaV3Container(BaseModelContainer):
         # Reserve VRAM for each GPU
         self.autosplit_reserve = [
-            int(math.ceil(value * 1024**2))
+            int(math.ceil(value / 1024))
             for value in autosplit_reserve_megabytes
         ]
         # TODO: speculative decoding
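
In short: the reserve values from the config are given in megabytes; before this commit they were converted to bytes (value * 1024**2), and now they are divided by 1024, i.e. treated as gigabytes, which is presumably the unit exllamav3 expects for reserve_per_device. A minimal standalone sketch of the arithmetic (illustrative variable names, not the project's code):

    import math

    autosplit_reserve_megabytes = [96]  # config default: 96 MB reserved on GPU 0

    # Old conversion, MB -> bytes
    old_reserve = [int(math.ceil(value * 1024**2)) for value in autosplit_reserve_megabytes]

    # New conversion, MB -> GB; note that int(math.ceil(...)) rounds the
    # default 96 MB up to a whole gigabyte per device
    new_reserve = [int(math.ceil(value / 1024)) for value in autosplit_reserve_megabytes]

    print(old_reserve)  # [100663296]
    print(new_reserve)  # [1]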
@@ -171,6 +171,7 @@ class ExllamaV3Container(BaseModelContainer):
     @torch.inference_mode()
     def load_model_sync(self, progress_callback=None):
         for value in self.model.load_gen(
+            reserve_per_device=self.autosplit_reserve,
             use_per_device=self.gpu_split,
             callback=progress_callback
         ):
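
This hunk passes the reserve list to the loader. A rough sketch of the values the call would receive under the defaults, assuming reserve_per_device is interpreted in gigabytes (an assumption based on the /1024 conversion above, not on exllamav3 documentation) and that gpu_split stays None when autosplit is enabled:

    # Illustrative values only, not taken from the repository:
    autosplit_reserve = [1]   # int(math.ceil(96 / 1024)) for the default 96 MB on GPU 0
    gpu_split = None          # autosplit: let the loader decide the per-GPU split

    # for value in self.model.load_gen(
    #     reserve_per_device=autosplit_reserve,
    #     use_per_device=gpu_split,
    #     callback=progress_callback,
    # ):
    #     ...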

@@ -193,7 +193,7 @@ class ModelConfig(BaseConfigModel):
             "Not parsed for single GPU users."
         ),
     )
-    autosplit_reserve: List[int] = Field(
+    autosplit_reserve: List[float] = Field(
         [96],
         description=(
             "Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).\n"