Model: Auto-detect a one GPU setup and fix gpu_split_auto
It makes more sense to use GPU split parameters when the user has more than one GPU. Otherwise, set the split and split_auto options to False and save the user some VRAM.

Signed-off-by: kingbri <bdashore3@proton.me>
parent 849179df17
commit c0ad647fa7

3 changed files with 15 additions and 9 deletions
```diff
@@ -70,7 +70,7 @@ class ModelLoadRequest(BaseModel):
         default=None,
         examples=[4096],
     )
-    gpu_split_auto: Optional[bool] = False
+    gpu_split_auto: Optional[bool] = True
     gpu_split: Optional[List[float]] = Field(
         default_factory=list, examples=[[24.0, 20.0]]
     )
```
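For illustration, a minimal runnable sketch of what the new default means for API clients. The class below is a stripped-down stand-in for the real ModelLoadRequest, with all unrelated fields omitted (an assumption for brevity, not the actual request model):

```python
from typing import List, Optional
from pydantic import BaseModel, Field

# Stand-in for the request model touched by the hunk above; field
# names and defaults match the diff, other fields are omitted.
class ModelLoadRequest(BaseModel):
    gpu_split_auto: Optional[bool] = True
    gpu_split: Optional[List[float]] = Field(
        default_factory=list, examples=[[24.0, 20.0]]
    )

req = ModelLoadRequest()   # client sent neither field
print(req.gpu_split_auto)  # True -- the new default
print(req.gpu_split)       # []   -- no manual split requested
```

In other words, a request that says nothing about GPU splitting now opts into auto-split by default, and the single-GPU guard added below decides whether that actually takes effect.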
```diff
@@ -103,8 +103,15 @@ class ExllamaV2Container:
         self.quiet = quiet
 
         self.cache_fp8 = "cache_mode" in kwargs and kwargs["cache_mode"] == "FP8"
-        self.gpu_split = kwargs.get("gpu_split")
-        self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), False)
+
+        # Turn off GPU split if the user is using 1 GPU
+        gpu_count = torch.cuda.device_count()
+        if gpu_count > 1:
+            self.gpu_split = kwargs.get("gpu_split")
+            self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+        else:
+            self.gpu_split_auto = False
+            logger.info("Disabling GPU split because one GPU is in use.")
 
         self.config = ExLlamaV2Config()
         self.config.model_dir = str(model_directory.resolve())
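A minimal sketch of the detection logic above, assuming `unwrap(value, default)` is a null-coalescing helper (the definition here is an assumption; the repo ships its own):

```python
import torch

def unwrap(value, default):
    # Assumed helper: fall back to default only when value is None.
    return value if value is not None else default

def resolve_gpu_split(**kwargs):
    """Mirrors the constructor logic in the hunk above: split options
    are only honored when more than one CUDA device is visible."""
    gpu_split = None
    gpu_split_auto = False
    if torch.cuda.device_count() > 1:
        gpu_split = kwargs.get("gpu_split")
        gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
    else:
        print("Disabling GPU split because one GPU is in use.")
    return gpu_split, gpu_split_auto

# On a single-GPU machine both options are ignored:
print(resolve_gpu_split(gpu_split=[24.0, 20.0], gpu_split_auto=True))
```

Note that torch.cuda.device_count() returns 0 on a CPU-only machine, so the `> 1` check treats that case the same as a single-GPU setup.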
```diff
@@ -354,9 +361,7 @@ class ExllamaV2Container:
         # Load model with manual split
         # Entrypoint for single GPU users
         if not self.gpu_split_auto:
-            logger.info(
-                "Loading with a manual GPU split (or a one GPU setup)"
-            )
+            logger.info("Loading with a manual GPU split (or a one GPU setup)")
 
             for value in self.model.load_gen(
                 self.gpu_split,
```
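The surrounding loop consumes model.load_gen as a generator that yields progress while the model loads. A generic sketch of that pattern (the yield values below are invented for illustration, not the actual exllamav2 API):

```python
def load_gen(gpu_split):
    # Stand-in loader: yields progress as modules are "loaded".
    total = 3
    for module in range(total):
        yield (module + 1, total)

# Manual-split entrypoint, mirroring the structure of the hunk above.
gpu_split_auto = False
if not gpu_split_auto:
    print("Loading with a manual GPU split (or a one GPU setup)")
    for value in load_gen(gpu_split=None):
        loaded, total = value
        print(f"Loaded {loaded}/{total} modules")
```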
```diff
@@ -68,11 +68,12 @@ model:
   # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral/Mixtral models)
   #override_base_seq_len:
 
-  # Automatically allocate resources to GPUs (default: False)
-  # WARNING: Will use more VRAM for single GPU users
-  #gpu_split_auto: False
+  # Automatically allocate resources to GPUs (default: True)
+  # NOTE: Not parsed for single GPU users
+  #gpu_split_auto: True
 
   # An integer array of GBs of vram to split between GPUs (default: [])
+  # NOTE: Not parsed for single GPU users
   #gpu_split: [20.6, 24]
 
   # Rope scale (default: 1.0)
```
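To tie the config back to the loader, a small sketch of how these YAML keys might flow into the container's kwargs. The file layout mirrors the snippet above, but the plumbing is an assumption:

```python
import yaml

# Excerpt mirroring the documented defaults above.
raw = """
model:
  gpu_split_auto: true
  gpu_split: [20.6, 24]
"""
model_cfg = yaml.safe_load(raw)["model"]

# These kwargs would reach ExllamaV2Container(**model_cfg); on a
# single-GPU machine both values are now ignored by the constructor.
print(model_cfg.get("gpu_split_auto"))  # True
print(model_cfg.get("gpu_split"))       # [20.6, 24]
```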