Model: Adjust draft_gpu_split and add to config

The previous code overrode the existing GPU split and device index
values. This change sets an independent draft_gpu_split value and
adjusts the GPU device list only if the draft_gpu_split array is
larger than the gpu_split array.

The draft GPU split is not tensor parallel, and it defaults to
gpu_split_auto if a split is not provided.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
kingbri 2025-02-08 16:09:46 -05:00
parent bd8256d168
commit beb6d8faa5
3 changed files with 22 additions and 7 deletions
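
Read as a standalone sketch, the behavior described in the commit message is roughly the following. The function name and return shape are made up for illustration; the real logic lives inside the container class shown in the diff below.

from typing import List, Tuple

def pick_draft_split(
    gpu_split: List[float], draft_gpu_split: List[float]
) -> Tuple[List[float], bool]:
    """Return (split to use for the draft model, whether to autosplit it)."""

    # The draft split is independent: it never overwrites gpu_split.
    if draft_gpu_split:
        return draft_gpu_split, False

    # No draft split provided -> fall back to autosplit for the draft model.
    return [], True

# Example: a manual draft split alongside an untouched main split.
main_split = [20.0, 20.0]
draft_split, autosplit = pick_draft_split(main_split, [2.0, 2.0])
assert draft_split == [2.0, 2.0] and autosplit is False
assert main_split == [20.0, 20.0]  # unchanged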


@@ -89,8 +89,8 @@ class ExllamaV2Container:
generation_config: Optional[GenerationConfig] = None
# GPU split vars
gpu_split: Optional[list] = None
draft_gpu_split: Optional[list] = None
gpu_split: List[float] = []
draft_gpu_split: List[float] = []
gpu_split_auto: bool = True
autosplit_reserve: List[float] = [96 * 1024**2]
use_tp: bool = False
@@ -234,10 +234,9 @@ class ExllamaV2Container:
for value in autosplit_reserve_megabytes
]
if self.draft_gpu_split:
self.gpu_split_auto = False
self.gpu_split = gpu_split
# Change the GPU device list only if gpu_split's list is too small
# This allows for an uneven list specification
if self.draft_gpu_split and len(self.draft_gpu_split) > len(self.gpu_split):
gpu_device_list = [
device_idx
for device_idx, memory in enumerate(self.draft_gpu_split)
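
As a concrete, hypothetical illustration of the uneven-list case the comments above describe (the numbers and the memory > 0 filter are assumptions, since the diff is truncated at this point):

# Hypothetical values, not taken from the repository:
gpu_split = [20.0, 20.0]           # main model split across GPU 0 and 1
draft_gpu_split = [2.0, 2.0, 2.0]  # draft model also wants GPU 2

# len(draft_gpu_split) > len(gpu_split), so the device list is rebuilt
# from the draft split (assumed filter: keep indices with a non-zero value).
gpu_device_list = [
    device_idx
    for device_idx, memory in enumerate(draft_gpu_split)
    if memory > 0
]
assert gpu_device_list == [0, 1, 2]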
@@ -387,6 +386,7 @@ class ExllamaV2Container:
# Set draft cache mode
self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")
# Edit the draft config size
if chunk_size:
self.draft_config.max_input_len = chunk_size
self.draft_config.max_attention_size = chunk_size**2
@@ -633,6 +633,8 @@
draft_cache_class = self.get_cache_class(self.draft_cache_mode)
if self.draft_gpu_split:
logger.info("Loading with a manual GPU split (or a one GPU setup)")
for value in self.draft_model.load_gen(
self.draft_gpu_split,
callback_gen=progress_callback,
@@ -647,6 +649,8 @@
model=self.draft_model,
)
else:
logger.info("Loading with autosplit")
self.draft_cache = self.create_cache(
cache_class=draft_cache_class,
autosplit=True,


@@ -351,6 +351,13 @@ class DraftModelConfig(BaseConfigModel):
f"Possible values: {str(CACHE_SIZES)[15:-1]}."
),
)
draft_gpu_split: List[float] = Field(
default_factory=list,
description=(
"An integer array of GBs of VRAM to split between GPUs (default: []).\n"
"If this isn't filled in, the draft model is autosplit."
),
)
class LoraInstanceModel(BaseConfigModel):
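
For illustration only, the new field behaves like any list-valued pydantic field with a default_factory. The stand-in model below is not the real DraftModelConfig (and assumes pydantic is installed); it only shows the default and an explicit value.

from typing import List

from pydantic import BaseModel, Field

class DraftSplitExample(BaseModel):  # hypothetical name for the example
    draft_gpu_split: List[float] = Field(default_factory=list)

# Omitted -> empty list -> the draft model is autosplit.
assert DraftSplitExample().draft_gpu_split == []

# Provided -> one value in GB per GPU, kept in order.
cfg = DraftSplitExample(draft_gpu_split=[2, 2, 2])
assert cfg.draft_gpu_split == [2.0, 2.0, 2.0]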


@@ -20,7 +20,7 @@ network:
# Turn on this option if you are ONLY connecting from localhost.
disable_auth: false
# Disable fetching external content in response to requests, such as images from URLs.
disable_fetch_requests: false
# Send tracebacks over the API (default: False).
@@ -166,6 +166,10 @@ draft_model:
# Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
draft_cache_mode: FP16
# An integer array of GBs of VRAM to split between GPUs (default: []).
# If this isn't filled in, the draft model is autosplit.
draft_gpu_split: []
# Options for Loras
lora:
# Directory to look for LoRAs (default: loras).
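
As a rough usage sketch of the draft_gpu_split sample entry above (assuming PyYAML is available; the keys mirror the sample rather than the project's actual config loader, and the values are hypothetical):

import yaml  # PyYAML, assumed to be installed

sample = """
draft_model:
  draft_cache_mode: FP16
  # Three GPUs, 2 GB each, for the draft model only.
  draft_gpu_split: [2, 2, 2]
"""

config = yaml.safe_load(sample)
draft = config["draft_model"]
assert draft["draft_gpu_split"] == [2, 2, 2]

# Leaving draft_gpu_split as [] (the sample default) means the draft model
# is autosplit instead of manually placed.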