Config: Expose auto GPU split reserve config

The GPU reserve is a VRAM buffer that prevents overflow when automatically
deciding how to load a model across multiple GPUs. Make this reserve
configurable.

Signed-off-by: kingbri <bdashore3@proton.me>
kingbri 2024-02-08 22:08:51 -05:00
parent 43bba526bf
commit 2f568ff573
3 changed files with 26 additions and 10 deletions
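
For illustration only, a minimal sketch of the reserve handling this commit introduces (the standalone helper and its name are hypothetical; the real logic lives in ExllamaV2Container, shown in the diff below). Reserve values are given in MB per GPU, converted to bytes, and padded with zeros so every detected GPU has an entry:

from typing import List

def build_autosplit_reserve(reserve_mb: List[float], gpu_count: int) -> List[int]:
    """Hypothetical helper mirroring the diff below: MB -> bytes, padded per GPU."""
    reserve_bytes = [int(value * 1024**2) for value in reserve_mb]
    # GPUs without an explicit entry get no extra reserve
    return reserve_bytes + [0] * (gpu_count - len(reserve_bytes))

# Example: the default of 96 MB on the first of three GPUs
print(build_autosplit_reserve([96], 3))  # [100663296, 0, 0]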


@@ -71,6 +71,7 @@ class ModelLoadRequest(BaseModel):
         examples=[4096],
     )
     gpu_split_auto: Optional[bool] = True
+    autosplit_reserve: Optional[List[float]] = [96]
     gpu_split: Optional[List[float]] = Field(
         default_factory=list, examples=[[24.0, 20.0]]
     )
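
As a hedged usage sketch (values are made up; only the field names come from the schema above), a load request enabling autosplit could reserve different amounts per GPU:

# Illustrative request body; field names match the schema above, values are made up.
load_request = {
    "gpu_split_auto": True,          # let the loader split the model automatically
    "autosplit_reserve": [512, 64],  # MB of VRAM to reserve on GPU 0 and GPU 1
}
# gpu_split is left at its default (empty list) since autosplit is enabled.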


@@ -28,13 +28,11 @@ from common.logger import init_logger
 logger = init_logger(__name__)
 
-# Bytes to reserve on first device when loading with auto split
-AUTO_SPLIT_RESERVE_BYTES = 96 * 1024**2
 
 class ExllamaV2Container:
     """The model container class for ExLlamaV2 models."""
 
     # Exl2 vars
     config: Optional[ExLlamaV2Config] = None
     draft_config: Optional[ExLlamaV2Config] = None
     model: Optional[ExLlamaV2] = None
@@ -44,13 +42,16 @@ class ExllamaV2Container:
     tokenizer: Optional[ExLlamaV2Tokenizer] = None
     generator: Optional[ExLlamaV2StreamingGenerator] = None
     prompt_template: Optional[PromptTemplate] = None
-    active_loras: List[ExLlamaV2Lora] = []
 
     # Internal config vars
     cache_fp8: bool = False
-    gpu_split_auto: bool = True
-    gpu_split: Optional[list] = None
     use_cfg: bool = False
+    active_loras: List[ExLlamaV2Lora] = []
+
+    # GPU split vars
+    gpu_split: Optional[list] = None
+    gpu_split_auto: bool = True
+    autosplit_reserve: List[float] = [96 * 1024**2]
 
     def __init__(self, model_directory: pathlib.Path, quiet=False, **kwargs):
         """
@@ -109,7 +110,13 @@ class ExllamaV2Container:
         gpu_count = torch.cuda.device_count()
         if gpu_count > 1:
             self.gpu_split = kwargs.get("gpu_split")
+
+            # Auto GPU split parameters
             self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+            autosplit_reserve_megabytes = unwrap(kwargs.get("autosplit_reserve"), [96])
+            self.autosplit_reserve = list(
+                map(lambda value: value * 1024**2, autosplit_reserve_megabytes)
+            )
         else:
             self.gpu_split_auto = False
             logger.info("Disabling GPU split because one GPU is in use.")
@@ -336,6 +343,12 @@ class ExllamaV2Container:
         # Load tokenizer
         self.tokenizer = ExLlamaV2Tokenizer(self.config)
 
+        # Calculate autosplit reserve for all GPUs
+        gpu_count = torch.cuda.device_count()
+        autosplit_reserve = self.autosplit_reserve + [0] * (
+            gpu_count - len(self.autosplit_reserve)
+        )
+
         # Load draft model if a config is present
         if self.draft_config:
             self.draft_model = ExLlamaV2(self.draft_config)
@@ -343,10 +356,9 @@ class ExllamaV2Container:
             logger.info("Loading draft model: " + self.draft_config.model_dir)
 
             self.draft_cache = ExLlamaV2Cache(self.draft_model, lazy=True)
-            reserve = [AUTO_SPLIT_RESERVE_BYTES] + [0] * 16
             yield from self.draft_model.load_autosplit_gen(
                 self.draft_cache,
-                reserve_vram=reserve,
+                reserve_vram=autosplit_reserve,
                 last_id_only=True,
                 callback_gen=progress_callback,
             )
@@ -385,10 +397,9 @@ class ExllamaV2Container:
         if self.gpu_split_auto:
             logger.info("Loading with autosplit")
 
-            reserve = [AUTO_SPLIT_RESERVE_BYTES] + [0] * 16
             for value in self.model.load_autosplit_gen(
                 self.cache,
-                reserve_vram=reserve,
+                reserve_vram=autosplit_reserve,
                 last_id_only=True,
                 callback_gen=progress_callback,
             ):


@@ -76,6 +76,10 @@ model:
   # NOTE: Not parsed for single GPU users
   #gpu_split_auto: True
 
+  # Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0)
+  # This is represented as an array of MB per GPU used
+  #autosplit_reserve: [96]
+
   # An integer array of GBs of vram to split between GPUs (default: [])
   # NOTE: Not parsed for single GPU users
   #gpu_split: [20.6, 24]
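
As a usage note with illustrative values: uncommenting the option and setting, for example, autosplit_reserve: [512, 64] would reserve 512 MB of VRAM on GPU 0 and 64 MB on GPU 1 during autosplit loading. GPUs beyond the length of the list get no extra reserve, matching the zero padding applied in the loader above.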