From ab1f4b7a6accbfc9c49c6035f567a7b8ac03b61f Mon Sep 17 00:00:00 2001
From: lucy <154630366+lucyknada@users.noreply.github.com>
Date: Wed, 27 Nov 2024 02:52:19 +0100
Subject: [PATCH 1/2] add draft_gpu_split option

---
 backends/exllamav2/model.py | 56 +++++++++++++++++++++++++++----------
 1 file changed, 42 insertions(+), 14 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index ff11531..d692233 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -90,6 +90,7 @@ class ExllamaV2Container:
 
     # GPU split vars
     gpu_split: Optional[list] = None
+    draft_gpu_split: Optional[list] = None
     gpu_split_auto: bool = True
     autosplit_reserve: List[float] = [96 * 1024**2]
     use_tp: bool = False
@@ -180,6 +181,7 @@ class ExllamaV2Container:
                 )
 
             draft_model_path = draft_model_path / draft_model_name
+            self.draft_gpu_split = draft_args.get("draft_gpu_split")
             self.draft_model_dir = draft_model_path
             self.draft_config.model_dir = str(draft_model_path.resolve())
             self.draft_config.prepare()
@@ -232,6 +234,16 @@ class ExllamaV2Container:
                 for value in autosplit_reserve_megabytes
            ]
 
+        if self.draft_gpu_split:
+            self.gpu_split_auto = False
+            self.gpu_split = gpu_split
+
+            gpu_device_list = [
+                device_idx
+                for device_idx, memory in enumerate(self.draft_gpu_split)
+                if memory > 0
+            ]
+
         # Hardcode max output length to 16
         self.config.max_output_len = 16
 
@@ -617,21 +629,37 @@ class ExllamaV2Container:
 
             # Draft uses the autosplit loader, so create a cache that reflects this
             draft_cache_class = self.get_cache_class(self.draft_cache_mode)
-            self.draft_cache = self.create_cache(
-                cache_class=draft_cache_class,
-                autosplit=True,
-                use_tp=False,
-                model=self.draft_model,
-            )
-            for value in self.draft_model.load_autosplit_gen(
-                self.draft_cache,
-                reserve_vram=autosplit_reserve,
-                last_id_only=True,
-                callback_gen=progress_callback,
-            ):
-                if value:
-                    yield value
+            if self.draft_gpu_split:
+                for value in self.draft_model.load_gen(
+                    self.draft_gpu_split,
+                    callback_gen=progress_callback,
+                ):
+                    if value:
+                        yield value
+
+                self.draft_cache = self.create_cache(
+                    cache_class=draft_cache_class,
+                    autosplit=False,
+                    use_tp=False,
+                    model=self.draft_model,
+                )
+            else:
+                self.draft_cache = self.create_cache(
+                    cache_class=draft_cache_class,
+                    autosplit=True,
+                    use_tp=False,
+                    model=self.draft_model,
+                )
+
+                for value in self.draft_model.load_autosplit_gen(
+                    self.draft_cache,
+                    reserve_vram=autosplit_reserve,
+                    last_id_only=True,
+                    callback_gen=progress_callback,
+                ):
+                    if value:
+                        yield value
 
         # Test VRAM allocation with a full-length forward pass
         input_ids = torch.zeros((1, self.config.max_input_len), dtype=torch.long)

From beb6d8faa5974c1e3f5084ab480215ca776d7c84 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Sat, 8 Feb 2025 16:09:46 -0500
Subject: [PATCH 2/2] Model: Adjust draft_gpu_split and add to config

The previous code overrode the existing gpu split and device idx values.
This now sets an independent draft_gpu_split value and adjusts the
gpu_devices check only if the draft_gpu_split array is larger than the
gpu_split array.

Draft gpu split is not Tensor Parallel, and defaults to gpu_split_auto
if a split is not provided.
Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 backends/exllamav2/model.py | 16 ++++++++++------
 common/config_models.py     |  7 +++++++
 config_sample.yml           |  6 +++++-
 3 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index d871f94..0684613 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -89,8 +89,8 @@ class ExllamaV2Container:
     generation_config: Optional[GenerationConfig] = None
 
     # GPU split vars
-    gpu_split: Optional[list] = None
-    draft_gpu_split: Optional[list] = None
+    gpu_split: List[float] = []
+    draft_gpu_split: List[float] = []
     gpu_split_auto: bool = True
     autosplit_reserve: List[float] = [96 * 1024**2]
     use_tp: bool = False
@@ -234,10 +234,9 @@ class ExllamaV2Container:
                 for value in autosplit_reserve_megabytes
            ]
 
-        if self.draft_gpu_split:
-            self.gpu_split_auto = False
-            self.gpu_split = gpu_split
-
+        # Change the GPU device list only if gpu_split's list is too small
+        # This allows for an uneven list specification
+        if self.draft_gpu_split and len(self.draft_gpu_split) > len(self.gpu_split):
             gpu_device_list = [
                 device_idx
                 for device_idx, memory in enumerate(self.draft_gpu_split)
                 if memory > 0
             ]
@@ -387,6 +386,7 @@ class ExllamaV2Container:
         # Set draft cache mode
         self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")
 
+        # Edit the draft config size
         if chunk_size:
             self.draft_config.max_input_len = chunk_size
             self.draft_config.max_attention_size = chunk_size**2
@@ -633,6 +633,8 @@ class ExllamaV2Container:
             draft_cache_class = self.get_cache_class(self.draft_cache_mode)
 
             if self.draft_gpu_split:
+                logger.info("Loading with a manual GPU split (or a one GPU setup)")
+
                 for value in self.draft_model.load_gen(
                     self.draft_gpu_split,
                     callback_gen=progress_callback,
@@ -647,6 +649,8 @@ class ExllamaV2Container:
                     model=self.draft_model,
                 )
             else:
+                logger.info("Loading with autosplit")
+
                 self.draft_cache = self.create_cache(
                     cache_class=draft_cache_class,
                     autosplit=True,
diff --git a/common/config_models.py b/common/config_models.py
index f7f0add..6c4ef05 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -351,6 +351,13 @@ class DraftModelConfig(BaseConfigModel):
             f"Possible values: {str(CACHE_SIZES)[15:-1]}."
         ),
     )
+    draft_gpu_split: List[float] = Field(
+        default_factory=list,
+        description=(
+            "An integer array of GBs of VRAM to split between GPUs (default: []).\n"
+            "If this isn't filled in, the draft model is autosplit."
+        ),
+    )
 
 
 class LoraInstanceModel(BaseConfigModel):
diff --git a/config_sample.yml b/config_sample.yml
index ebea5a1..8d33231 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -20,7 +20,7 @@ network:
   # Turn on this option if you are ONLY connecting from localhost.
   disable_auth: false
 
-  # Disable fetching external content in response to requests, such as images from URLs.
+  # Disable fetching external content in response to requests,such as images from URLs.
   disable_fetch_requests: false
 
   # Send tracebacks over the API (default: False).
@@ -166,6 +166,10 @@ draft_model:
   # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
   draft_cache_mode: FP16
 
+  # An integer array of GBs of VRAM to split between GPUs (default: []).
+  # If this isn't filled in, the draft model is autosplit.
+  draft_gpu_split: []
+
 # Options for Loras
 lora:
   # Directory to look for LoRAs (default: loras).
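
Note on the split selection in PATCH 2/2: the device list is only recomputed from draft_gpu_split when that array names more devices than gpu_split, and an empty draft_gpu_split leaves the draft model on the autosplit path. Below is a minimal, self-contained sketch of that behaviour; the helper name resolve_draft_device_list is made up for illustration, and the assumption that gpu_device_list is otherwise derived from gpu_split comes from reading the diff, not from code shown in it.

```python
from typing import List


def resolve_draft_device_list(
    gpu_split: List[float], draft_gpu_split: List[float]
) -> List[int]:
    """Sketch of the device-list adjustment: start from the devices named by
    gpu_split (assumed behaviour of the surrounding code, not shown in the
    diff) and only widen the list when draft_gpu_split is the longer,
    'uneven' specification."""
    gpu_device_list = [idx for idx, memory in enumerate(gpu_split) if memory > 0]

    if draft_gpu_split and len(draft_gpu_split) > len(gpu_split):
        gpu_device_list = [
            idx for idx, memory in enumerate(draft_gpu_split) if memory > 0
        ]

    return gpu_device_list


# Main model pinned to GPU 0, draft model spread over GPUs 0 and 1
# (illustrative values, in GB):
print(resolve_draft_device_list([24], [10, 10]))  # -> [0, 1]

# No draft split given: the device list comes from gpu_split alone and the
# draft model is loaded with autosplit instead of a manual split.
print(resolve_draft_device_list([24, 24], []))  # -> [0, 1]
```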
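On the config side, the new draft_gpu_split field is a plain float list on DraftModelConfig that defaults to empty, which means autosplit. A rough stand-alone sketch of how the field behaves, using pydantic.BaseModel in place of tabbyAPI's BaseConfigModel and with made-up split values:

```python
from typing import List

from pydantic import BaseModel, Field


class DraftModelConfig(BaseModel):
    """Trimmed stand-in for the config model touched by the patch; only the
    new field is reproduced here."""

    draft_gpu_split: List[float] = Field(
        default_factory=list,
        description=(
            "An integer array of GBs of VRAM to split between GPUs (default: []).\n"
            "If this isn't filled in, the draft model is autosplit."
        ),
    )


# Equivalent of `draft_gpu_split: [20.5, 24]` in config.yml
# (the split values themselves are illustrative, not from the patch):
cfg = DraftModelConfig(draft_gpu_split=[20.5, 24])
print(cfg.draft_gpu_split)  # [20.5, 24.0]

# Omitting the key keeps the default empty list, i.e. the draft model
# falls back to autosplit:
print(DraftModelConfig().draft_gpu_split)  # []
```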