Config: Expose auto GPU split reserve config
The GPU reserve is used as a VRAM buffer to prevent GPU overflow when automatically deciding how to load a model on multiple GPUs. Make this configurable.

Signed-off-by: kingbri <bdashore3@proton.me>
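As a rough sketch of the intended behavior (illustrative only; the helper name below is not part of this commit): the reserve is given as megabytes per GPU, converted to bytes, and padded with zeros so every detected GPU gets an entry before it is handed to the autosplit loader.

def build_reserve_bytes(reserve_mb, gpu_count):
    # Convert per-GPU MB values to bytes; pad remaining GPUs with 0.
    reserve_bytes = [value * 1024**2 for value in reserve_mb]
    return reserve_bytes + [0] * (gpu_count - len(reserve_bytes))

# Default of [96] on a 3-GPU machine -> 96 MB reserved on GPU 0 only.
print(build_reserve_bytes([96], 3))       # [100663296, 0, 0]
print(build_reserve_bytes([600, 96], 3))  # [629145600, 100663296, 0]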
parent 43bba526bf
commit 2f568ff573

3 changed files with 26 additions and 10 deletions
@@ -71,6 +71,7 @@ class ModelLoadRequest(BaseModel):
         examples=[4096],
     )
     gpu_split_auto: Optional[bool] = True
+    autosplit_reserve: Optional[List[float]] = [96]
     gpu_split: Optional[List[float]] = Field(
         default_factory=list, examples=[[24.0, 20.0]]
     )
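As a usage illustration for the new request field, a client might include autosplit_reserve when asking the server to load a model. This is a minimal sketch: the route, auth header, and model-name field are assumptions about the surrounding API, not part of this diff.

import requests

# Hypothetical load request; gpu_split_auto and autosplit_reserve come from the
# schema above, everything else is assumed for illustration.
payload = {
    "name": "MyModel-exl2",          # assumed model-name field
    "gpu_split_auto": True,
    "autosplit_reserve": [600, 96],  # MB kept free on GPU 0 and GPU 1
}

response = requests.post(
    "http://localhost:5000/v1/model/load",      # assumed route
    headers={"x-admin-key": "YOUR_ADMIN_KEY"},  # assumed auth header
    json=payload,
)
print(response.status_code)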
@@ -28,13 +28,11 @@ from common.logger import init_logger
 
 logger = init_logger(__name__)
 
-# Bytes to reserve on first device when loading with auto split
-AUTO_SPLIT_RESERVE_BYTES = 96 * 1024**2
 
 
 class ExllamaV2Container:
     """The model container class for ExLlamaV2 models."""
 
     # Exl2 vars
     config: Optional[ExLlamaV2Config] = None
     draft_config: Optional[ExLlamaV2Config] = None
     model: Optional[ExLlamaV2] = None
@@ -44,13 +42,16 @@ class ExllamaV2Container:
     tokenizer: Optional[ExLlamaV2Tokenizer] = None
     generator: Optional[ExLlamaV2StreamingGenerator] = None
     prompt_template: Optional[PromptTemplate] = None
+    active_loras: List[ExLlamaV2Lora] = []
 
     # Internal config vars
     cache_fp8: bool = False
-    gpu_split_auto: bool = True
-    gpu_split: Optional[list] = None
     use_cfg: bool = False
 
-    active_loras: List[ExLlamaV2Lora] = []
+    # GPU split vars
+    gpu_split: Optional[list] = None
+    gpu_split_auto: bool = True
+    autosplit_reserve: List[float] = [96 * 1024**2]
 
     def __init__(self, model_directory: pathlib.Path, quiet=False, **kwargs):
         """
@@ -109,7 +110,13 @@ class ExllamaV2Container:
         gpu_count = torch.cuda.device_count()
         if gpu_count > 1:
             self.gpu_split = kwargs.get("gpu_split")
+
+            # Auto GPU split parameters
             self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+            autosplit_reserve_megabytes = unwrap(kwargs.get("autosplit_reserve"), [96])
+            self.autosplit_reserve = list(
+                map(lambda value: value * 1024**2, autosplit_reserve_megabytes)
+            )
         else:
             self.gpu_split_auto = False
             logger.info("Disabling GPU split because one GPU is in use.")
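The parsing above leans on unwrap to fall back to a default when a key is missing or None; a small sketch of that assumed behavior (inferred from the call sites here, not copied from common/utils):

# Assumed semantics of unwrap(value, default), inferred from its usage above.
def unwrap(value, default=None):
    return value if value is not None else default

kwargs = {}  # e.g. neither gpu_split_auto nor autosplit_reserve was supplied
gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)           # True
autosplit_reserve_mb = unwrap(kwargs.get("autosplit_reserve"), [96])  # [96]
autosplit_reserve = [value * 1024**2 for value in autosplit_reserve_mb]
print(gpu_split_auto, autosplit_reserve)  # True [100663296]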
@@ -336,6 +343,12 @@ class ExllamaV2Container:
         # Load tokenizer
         self.tokenizer = ExLlamaV2Tokenizer(self.config)
 
+        # Calculate autosplit reserve for all GPUs
+        gpu_count = torch.cuda.device_count()
+        autosplit_reserve = self.autosplit_reserve + [0] * (
+            gpu_count - len(self.autosplit_reserve)
+        )
+
         # Load draft model if a config is present
         if self.draft_config:
             self.draft_model = ExLlamaV2(self.draft_config)
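The padding above only extends the list: in Python, multiplying a list by a negative number produces an empty list, so a reserve list that already has an entry for every GPU (or more) passes through unchanged. A quick illustration of the same expression with self.autosplit_reserve inlined:

def pad_reserve(autosplit_reserve, gpu_count):
    # Same expression as above, pulled out of the class for illustration.
    return autosplit_reserve + [0] * (gpu_count - len(autosplit_reserve))

print(pad_reserve([96 * 1024**2], 4))  # [100663296, 0, 0, 0]
print(pad_reserve([1, 2], 2))          # [1, 2] (no padding needed)
print(pad_reserve([1, 2, 3], 2))       # [1, 2, 3] ([0] * -1 == [])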
@@ -343,10 +356,9 @@ class ExllamaV2Container:
             logger.info("Loading draft model: " + self.draft_config.model_dir)
 
             self.draft_cache = ExLlamaV2Cache(self.draft_model, lazy=True)
-            reserve = [AUTO_SPLIT_RESERVE_BYTES] + [0] * 16
             yield from self.draft_model.load_autosplit_gen(
                 self.draft_cache,
-                reserve_vram=reserve,
+                reserve_vram=autosplit_reserve,
                 last_id_only=True,
                 callback_gen=progress_callback,
             )
@@ -385,10 +397,9 @@ class ExllamaV2Container:
         if self.gpu_split_auto:
             logger.info("Loading with autosplit")
 
-            reserve = [AUTO_SPLIT_RESERVE_BYTES] + [0] * 16
             for value in self.model.load_autosplit_gen(
                 self.cache,
-                reserve_vram=reserve,
+                reserve_vram=autosplit_reserve,
                 last_id_only=True,
                 callback_gen=progress_callback,
             ):
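With the default of [96] MB, both the draft and main model loads reserve the same 96 MB on the first device that the old hard-coded constant did; only the length of the reserve list changes, since it is now sized to the detected GPU count instead of a fixed 17 entries. A quick check of that equivalence:

# Old behavior: fixed 96 MB reserved on the first device, 16 trailing zeros.
AUTO_SPLIT_RESERVE_BYTES = 96 * 1024**2
old_reserve = [AUTO_SPLIT_RESERVE_BYTES] + [0] * 16

# New behavior with the default autosplit_reserve of [96] MB on a 4-GPU host.
gpu_count = 4
new_reserve = [96 * 1024**2] + [0] * (gpu_count - 1)

assert old_reserve[0] == new_reserve[0] == 100663296
assert not any(old_reserve[1:]) and not any(new_reserve[1:])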
@@ -76,6 +76,10 @@ model:
   # NOTE: Not parsed for single GPU users
   #gpu_split_auto: True
 
+  # Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0)
+  # This is represented as an array of MB per GPU used
+  #autosplit_reserve: [96]
+
   # An integer array of GBs of vram to split between GPUs (default: [])
   # NOTE: Not parsed for single GPU users
   #gpu_split: [20.6, 24]
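For reference, a rough sketch of how a config block like the sample above could reach the container, assuming the model: section is passed through as keyword arguments (consistent with the kwargs.get(...) calls in the loader); the YAML handling here is illustrative, not tabbyAPI's actual startup code.

import yaml  # PyYAML

# Illustrative, uncommented version of the sample config above.
raw = """
model:
  gpu_split_auto: True
  autosplit_reserve: [600, 96]
"""

model_cfg = yaml.safe_load(raw)["model"]

# The container reads these through kwargs.get(...), so the section could be
# splatted in as keyword arguments, e.g.:
#   container = ExllamaV2Container(model_directory, **model_cfg)
reserve_bytes = [mb * 1024**2 for mb in model_cfg["autosplit_reserve"]]
print(reserve_bytes)  # [629145600, 100663296]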