From ab1f4b7a6accbfc9c49c6035f567a7b8ac03b61f Mon Sep 17 00:00:00 2001
From: lucy <154630366+lucyknada@users.noreply.github.com>
Date: Wed, 27 Nov 2024 02:52:19 +0100
Subject: [PATCH 1/2] add draft_gpu_split option

---
 backends/exllamav2/model.py | 56 +++++++++++++++++++++++++++----------
 1 file changed, 42 insertions(+), 14 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index ff11531..d692233 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -90,6 +90,7 @@ class ExllamaV2Container:
 
     # GPU split vars
     gpu_split: Optional[list] = None
+    draft_gpu_split: Optional[list] = None
     gpu_split_auto: bool = True
     autosplit_reserve: List[float] = [96 * 1024**2]
     use_tp: bool = False
@@ -180,6 +181,7 @@ class ExllamaV2Container:
                 )
 
             draft_model_path = draft_model_path / draft_model_name
+            self.draft_gpu_split = draft_args.get("draft_gpu_split")
             self.draft_model_dir = draft_model_path
             self.draft_config.model_dir = str(draft_model_path.resolve())
             self.draft_config.prepare()
@@ -232,6 +234,16 @@ class ExllamaV2Container:
                 for value in autosplit_reserve_megabytes
            ]
 
+        if self.draft_gpu_split:
+            self.gpu_split_auto = False
+            self.gpu_split = gpu_split
+
+            gpu_device_list = [
+                device_idx
+                for device_idx, memory in enumerate(self.draft_gpu_split)
+                if memory > 0
+            ]
+
         # Hardcode max output length to 16
         self.config.max_output_len = 16
 
@@ -617,21 +629,37 @@ class ExllamaV2Container:
 
             # Draft uses the autosplit loader, so create a cache that reflects this
             draft_cache_class = self.get_cache_class(self.draft_cache_mode)
-            self.draft_cache = self.create_cache(
-                cache_class=draft_cache_class,
-                autosplit=True,
-                use_tp=False,
-                model=self.draft_model,
-            )
-            for value in self.draft_model.load_autosplit_gen(
-                self.draft_cache,
-                reserve_vram=autosplit_reserve,
-                last_id_only=True,
-                callback_gen=progress_callback,
-            ):
-                if value:
-                    yield value
+            if self.draft_gpu_split:
+                for value in self.draft_model.load_gen(
+                    self.draft_gpu_split,
+                    callback_gen=progress_callback,
+                ):
+                    if value:
+                        yield value
+
+                self.draft_cache = self.create_cache(
+                    cache_class=draft_cache_class,
+                    autosplit=False,
+                    use_tp=False,
+                    model=self.draft_model,
+                )
+            else:
+                self.draft_cache = self.create_cache(
+                    cache_class=draft_cache_class,
+                    autosplit=True,
+                    use_tp=False,
+                    model=self.draft_model,
+                )
+
+                for value in self.draft_model.load_autosplit_gen(
+                    self.draft_cache,
+                    reserve_vram=autosplit_reserve,
+                    last_id_only=True,
+                    callback_gen=progress_callback,
+                ):
+                    if value:
+                        yield value
 
         # Test VRAM allocation with a full-length forward pass
         input_ids = torch.zeros((1, self.config.max_input_len), dtype=torch.long)

From beb6d8faa5974c1e3f5084ab480215ca776d7c84 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Sat, 8 Feb 2025 16:09:46 -0500
Subject: [PATCH 2/2] Model: Adjust draft_gpu_split and add to config

The previous code overrode the existing gpu split and device idx values.
This now sets an independent draft_gpu_split value and adjusts the
gpu_devices check only if the draft_gpu_split array is larger than the
gpu_split array.

Draft gpu split is not Tensor Parallel, and defaults to gpu_split_auto
if a split is not provided.
Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 backends/exllamav2/model.py | 16 ++++++++++------
 common/config_models.py     |  7 +++++++
 config_sample.yml           |  6 +++++-
 3 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index d871f94..0684613 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -89,8 +89,8 @@ class ExllamaV2Container:
     generation_config: Optional[GenerationConfig] = None
 
     # GPU split vars
-    gpu_split: Optional[list] = None
-    draft_gpu_split: Optional[list] = None
+    gpu_split: List[float] = []
+    draft_gpu_split: List[float] = []
     gpu_split_auto: bool = True
     autosplit_reserve: List[float] = [96 * 1024**2]
     use_tp: bool = False
@@ -234,10 +234,9 @@ class ExllamaV2Container:
                 for value in autosplit_reserve_megabytes
            ]
 
-        if self.draft_gpu_split:
-            self.gpu_split_auto = False
-            self.gpu_split = gpu_split
-
+        # Change the GPU device list only if gpu_split's list is too small
+        # This allows for an uneven list specification
+        if self.draft_gpu_split and len(self.draft_gpu_split) > len(self.gpu_split):
             gpu_device_list = [
                 device_idx
                 for device_idx, memory in enumerate(self.draft_gpu_split)
                 if memory > 0
             ]
@@ -387,6 +386,7 @@ class ExllamaV2Container:
         # Set draft cache mode
         self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")
 
+        # Edit the draft config size
         if chunk_size:
             self.draft_config.max_input_len = chunk_size
             self.draft_config.max_attention_size = chunk_size**2
@@ -633,6 +633,8 @@ class ExllamaV2Container:
             draft_cache_class = self.get_cache_class(self.draft_cache_mode)
 
             if self.draft_gpu_split:
+                logger.info("Loading with a manual GPU split (or a one GPU setup)")
+
                 for value in self.draft_model.load_gen(
                     self.draft_gpu_split,
                     callback_gen=progress_callback,
@@ -647,6 +649,8 @@ class ExllamaV2Container:
                     model=self.draft_model,
                 )
             else:
+                logger.info("Loading with autosplit")
+
                 self.draft_cache = self.create_cache(
                     cache_class=draft_cache_class,
                     autosplit=True,
diff --git a/common/config_models.py b/common/config_models.py
index f7f0add..6c4ef05 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -351,6 +351,13 @@ class DraftModelConfig(BaseConfigModel):
             f"Possible values: {str(CACHE_SIZES)[15:-1]}."
         ),
     )
+    draft_gpu_split: List[float] = Field(
+        default_factory=list,
+        description=(
+            "An integer array of GBs of VRAM to split between GPUs (default: []).\n"
+            "If this isn't filled in, the draft model is autosplit."
+        ),
+    )
 
 
 class LoraInstanceModel(BaseConfigModel):
diff --git a/config_sample.yml b/config_sample.yml
index ebea5a1..8d33231 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -20,7 +20,7 @@ network:
   # Turn on this option if you are ONLY connecting from localhost.
   disable_auth: false
 
-  # Disable fetching external content in response to requests, such as images from URLs.
+  # Disable fetching external content in response to requests,such as images from URLs.
   disable_fetch_requests: false
 
   # Send tracebacks over the API (default: False).
@@ -166,6 +166,10 @@ draft_model:
   # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
   draft_cache_mode: FP16
 
+  # An integer array of GBs of VRAM to split between GPUs (default: []).
+  # If this isn't filled in, the draft model is autosplit.
+  draft_gpu_split: []
+
 # Options for Loras
 lora:
   # Directory to look for LoRAs (default: loras).
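
Note on the split selection in PATCH 2/2: the device list is only recomputed from draft_gpu_split when that array names more devices than gpu_split, and an empty draft_gpu_split leaves the draft model on the autosplit path. Below is a minimal, self-contained sketch of that behaviour; the helper name resolve_draft_device_list is made up for illustration, and the assumption that gpu_device_list is otherwise derived from gpu_split comes from reading the diff, not from code shown in it.

```python
from typing import List


def resolve_draft_device_list(
    gpu_split: List[float], draft_gpu_split: List[float]
) -> List[int]:
    """Sketch of the device-list adjustment: start from the devices named by
    gpu_split (assumed behaviour of the surrounding code, not shown in the
    diff) and only widen the list when draft_gpu_split is the longer,
    'uneven' specification."""
    gpu_device_list = [idx for idx, memory in enumerate(gpu_split) if memory > 0]

    if draft_gpu_split and len(draft_gpu_split) > len(gpu_split):
        gpu_device_list = [
            idx for idx, memory in enumerate(draft_gpu_split) if memory > 0
        ]

    return gpu_device_list


# Main model pinned to GPU 0, draft model spread over GPUs 0 and 1
# (illustrative values, in GB):
print(resolve_draft_device_list([24], [10, 10]))  # -> [0, 1]

# No draft split given: the device list comes from gpu_split alone and the
# draft model is loaded with autosplit instead of a manual split.
print(resolve_draft_device_list([24, 24], []))  # -> [0, 1]
```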
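On the config side, the new draft_gpu_split field is a plain float list on DraftModelConfig that defaults to empty, which means autosplit. A rough stand-alone sketch of how the field behaves, using pydantic.BaseModel in place of tabbyAPI's BaseConfigModel and with made-up split values:

```python
from typing import List

from pydantic import BaseModel, Field


class DraftModelConfig(BaseModel):
    """Trimmed stand-in for the config model touched by the patch; only the
    new field is reproduced here."""

    draft_gpu_split: List[float] = Field(
        default_factory=list,
        description=(
            "An integer array of GBs of VRAM to split between GPUs (default: []).\n"
            "If this isn't filled in, the draft model is autosplit."
        ),
    )


# Equivalent of `draft_gpu_split: [20.5, 24]` in config.yml
# (the split values themselves are illustrative, not from the patch):
cfg = DraftModelConfig(draft_gpu_split=[20.5, 24])
print(cfg.draft_gpu_split)  # [20.5, 24.0]

# Omitting the key keeps the default empty list, i.e. the draft model
# falls back to autosplit:
print(DraftModelConfig().draft_gpu_split)  # []
```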