fixup: autosplit reserve
this probably breaks v2 support
This commit is contained in:
parent
acb3adb953
commit
306fc7cd15
2 changed files with 4 additions and 3 deletions
|
|
@@ -49,7 +49,7 @@ class ExllamaV3Container(BaseModelContainer):
|
||||||
config: Config
|
config: Config
|
||||||
gpu_split: List[float] | None = None
|
gpu_split: List[float] | None = None
|
||||||
gpu_split_auto: bool = True
|
gpu_split_auto: bool = True
|
||||||
autosplit_reserve: List[float] = [96 * 1024**2]
|
autosplit_reserve: List[float] = [96 / 1024]
|
||||||
max_seq_len: int
|
max_seq_len: int
|
||||||
use_tp: bool = False
|
use_tp: bool = False
|
||||||
|
|
||||||
|
|
@@ -112,7 +112,7 @@ class ExllamaV3Container(BaseModelContainer):
|
||||||
|
|
||||||
# Reserve VRAM for each GPU
|
# Reserve VRAM for each GPU
|
||||||
self.autosplit_reserve = [
|
self.autosplit_reserve = [
|
||||||
int(math.ceil(value * 1024**2))
|
int(math.ceil(value/1024))
|
||||||
for value in autosplit_reserve_megabytes
|
for value in autosplit_reserve_megabytes
|
||||||
]
|
]
|
||||||
# TODO: speculative decoding
|
# TODO: speculative decoding
|
||||||
|
|
@@ -171,6 +171,7 @@ class ExllamaV3Container(BaseModelContainer):
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def load_model_sync(self, progress_callback=None):
|
def load_model_sync(self, progress_callback=None):
|
||||||
for value in self.model.load_gen(
|
for value in self.model.load_gen(
|
||||||
|
reserve_per_device=self.autosplit_reserve,
|
||||||
use_per_device=self.gpu_split,
|
use_per_device=self.gpu_split,
|
||||||
callback=progress_callback
|
callback=progress_callback
|
||||||
):
|
):
|
||||||
|
|
|
||||||
|
|
@@ -193,7 +193,7 @@ class ModelConfig(BaseConfigModel):
|
||||||
"Not parsed for single GPU users."
|
"Not parsed for single GPU users."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
autosplit_reserve: List[int] = Field(
|
autosplit_reserve: List[float] = Field(
|
||||||
[96],
|
[96],
|
||||||
description=(
|
description=(
|
||||||
"Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).\n"
|
"Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).\n"
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue