Model: Auto-detect a one GPU setup and fix gpu_split_auto
It makes more sense to use GPU split parameters when the user has more than one GPU. Otherwise, set the split and split_auto options to False and save the user some VRAM.

Signed-off-by: kingbri <bdashore3@proton.me>
parent 849179df17
commit c0ad647fa7

3 changed files with 15 additions and 9 deletions
```diff
@@ -70,7 +70,7 @@ class ModelLoadRequest(BaseModel):
         default=None,
         examples=[4096],
     )
-    gpu_split_auto: Optional[bool] = False
+    gpu_split_auto: Optional[bool] = True
     gpu_split: Optional[List[float]] = Field(
         default_factory=list, examples=[[24.0, 20.0]]
     )
```
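For illustration, a minimal runnable sketch of what the new default means for API clients. The class below is a stripped-down stand-in for the real ModelLoadRequest, with all unrelated fields omitted (an assumption for brevity, not the actual request model):

```python
from typing import List, Optional
from pydantic import BaseModel, Field

# Stand-in for the request model touched by the hunk above; field
# names and defaults match the diff, other fields are omitted.
class ModelLoadRequest(BaseModel):
    gpu_split_auto: Optional[bool] = True
    gpu_split: Optional[List[float]] = Field(
        default_factory=list, examples=[[24.0, 20.0]]
    )

req = ModelLoadRequest()   # client sent neither field
print(req.gpu_split_auto)  # True -- the new default
print(req.gpu_split)       # []   -- no manual split requested
```

In other words, a request that says nothing about GPU splitting now opts into auto-split by default, and the single-GPU guard added below decides whether that actually takes effect.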
```diff
@@ -103,8 +103,15 @@ class ExllamaV2Container:
         self.quiet = quiet
 
         self.cache_fp8 = "cache_mode" in kwargs and kwargs["cache_mode"] == "FP8"
-        self.gpu_split = kwargs.get("gpu_split")
-        self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), False)
+
+        # Turn off GPU split if the user is using 1 GPU
+        gpu_count = torch.cuda.device_count()
+        if gpu_count > 1:
+            self.gpu_split = kwargs.get("gpu_split")
+            self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+        else:
+            self.gpu_split_auto = False
+            logger.info("Disabling GPU split because one GPU is in use.")
 
         self.config = ExLlamaV2Config()
         self.config.model_dir = str(model_directory.resolve())
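A minimal sketch of the detection logic above, assuming `unwrap(value, default)` is a null-coalescing helper (the definition here is an assumption; the repo ships its own):

```python
import torch

def unwrap(value, default):
    # Assumed helper: fall back to default only when value is None.
    return value if value is not None else default

def resolve_gpu_split(**kwargs):
    """Mirrors the constructor logic in the hunk above: split options
    are only honored when more than one CUDA device is visible."""
    gpu_split = None
    gpu_split_auto = False
    if torch.cuda.device_count() > 1:
        gpu_split = kwargs.get("gpu_split")
        gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
    else:
        print("Disabling GPU split because one GPU is in use.")
    return gpu_split, gpu_split_auto

# On a single-GPU machine both options are ignored:
print(resolve_gpu_split(gpu_split=[24.0, 20.0], gpu_split_auto=True))
```

Note that torch.cuda.device_count() returns 0 on a CPU-only machine, so the `> 1` check treats that case the same as a single-GPU setup.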
```diff
@@ -354,9 +361,7 @@ class ExllamaV2Container:
         # Load model with manual split
         # Entrypoint for single GPU users
         if not self.gpu_split_auto:
-            logger.info(
-                "Loading with a manual GPU split (or a one GPU setup)"
-            )
+            logger.info("Loading with a manual GPU split (or a one GPU setup)")
 
             for value in self.model.load_gen(
                 self.gpu_split,
```
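The surrounding loop consumes model.load_gen as a generator that yields progress while the model loads. A generic sketch of that pattern (the yield values below are invented for illustration, not the actual exllamav2 API):

```python
def load_gen(gpu_split):
    # Stand-in loader: yields progress as modules are "loaded".
    total = 3
    for module in range(total):
        yield (module + 1, total)

# Manual-split entrypoint, mirroring the structure of the hunk above.
gpu_split_auto = False
if not gpu_split_auto:
    print("Loading with a manual GPU split (or a one GPU setup)")
    for value in load_gen(gpu_split=None):
        loaded, total = value
        print(f"Loaded {loaded}/{total} modules")
```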
```diff
@@ -68,11 +68,12 @@ model:
   # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral/Mixtral models)
   #override_base_seq_len:
 
-  # Automatically allocate resources to GPUs (default: False)
-  # WARNING: Will use more VRAM for single GPU users
-  #gpu_split_auto: False
+  # Automatically allocate resources to GPUs (default: True)
+  # NOTE: Not parsed for single GPU users
+  #gpu_split_auto: True
 
   # An integer array of GBs of vram to split between GPUs (default: [])
+  # NOTE: Not parsed for single GPU users
   #gpu_split: [20.6, 24]
 
   # Rope scale (default: 1.0)
```
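To tie the config back to the loader, a small sketch of how these YAML keys might flow into the container's kwargs. The file layout mirrors the snippet above, but the plumbing is an assumption:

```python
import yaml

# Excerpt mirroring the documented defaults above.
raw = """
model:
  gpu_split_auto: true
  gpu_split: [20.6, 24]
"""
model_cfg = yaml.safe_load(raw)["model"]

# These kwargs would reach ExllamaV2Container(**model_cfg); on a
# single-GPU machine both values are now ignored by the constructor.
print(model_cfg.get("gpu_split_auto"))  # True
print(model_cfg.get("gpu_split"))       # [20.6, 24]
```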