From 43f9483bc435083fc3751e664c14f2eb6acb6ad2 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Sun, 17 Aug 2025 21:42:30 -0400
Subject: [PATCH] Model: Add tensor_parallel_backend option

This allows users to choose the nccl or native backend depending on
their GPU setup. NCCL is only available with Linux-built wheels.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 backends/exllamav3/model.py   | 10 +++++++++-
 common/config_models.py       | 11 ++++++++++-
 config_sample.yml             |  6 ++++++
 endpoints/core/types/model.py |  1 +
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index 05ae386..53d2e91 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -25,6 +25,7 @@ from loguru import logger
 
 from backends.base_model_container import BaseModelContainer
 from backends.exllamav3.sampler import ExllamaV3SamplerBuilder
+from backends.exllamav3.utils import exllama_supports_nccl
 from backends.exllamav3.vision import clear_image_embedding_cache
 from common.concurrency import iterate_in_threadpool
 from common.gen_logging import (
@@ -78,6 +79,7 @@ class ExllamaV3Container(BaseModelContainer):
     gpu_split_auto: bool = True
     autosplit_reserve: Optional[List[float]] = [96 / 1024]
     use_tp: bool = False
+    tp_backend: str = "native"
     max_seq_len: int = 4096
     cache_size: int = 4096
     cache_mode: str = "FP16"
@@ -173,6 +175,12 @@ class ExllamaV3Container(BaseModelContainer):
         # Set tensor parallel
         if use_tp:
             self.use_tp = True
+            tp_backend = unwrap(kwargs.get("tensor_parallel_backend"), "native")
+
+            if not exllama_supports_nccl():
+                tp_backend = "native"
+
+            self.tp_backend = tp_backend
 
             # TP has its own autosplit loader
             self.gpu_split_auto = False
@@ -458,7 +466,7 @@ class ExllamaV3Container(BaseModelContainer):
 
         for value in self.model.load_gen(
             tensor_p=self.use_tp,
-            tp_backend="native",
+            tp_backend=self.tp_backend,
             reserve_per_device=self.autosplit_reserve,
             use_per_device=self.gpu_split,
             callback=progress_callback,
diff --git a/common/config_models.py b/common/config_models.py
index b2b36e4..9898c04 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -183,11 +183,20 @@ class ModelConfig(BaseConfigModel):
     tensor_parallel: Optional[bool] = Field(
         False,
         description=(
-            "Load model with tensor parallelism.\n"
+            "Load model with tensor parallelism (default: False).\n"
             "Falls back to autosplit if GPU split isn't provided.\n"
             "This ignores the gpu_split_auto value."
         ),
     )
+    tensor_parallel_backend: Optional[str] = Field(
+        "native",
+        description=(
+            "Sets the backend for tensor parallelism (default: native).\n"
+            "Options: native, nccl\n"
+            "Native is recommended for PCIe GPUs.\n"
+            "NCCL is recommended for NVLink."
+        ),
+    )
     gpu_split_auto: Optional[bool] = Field(
         True,
         description=(
diff --git a/config_sample.yml b/config_sample.yml
index 97cafa6..890596e 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -87,6 +87,12 @@ model:
   # This ignores the gpu_split_auto value.
   tensor_parallel: false
 
+  # Sets the backend for tensor parallelism (default: native).
+  # Options: native, nccl
+  # Native is recommended for PCIe GPUs.
+  # NCCL is recommended for NVLink.
+  tensor_parallel_backend: native
+
   # Automatically allocate resources to GPUs (default: True).
   # Not parsed for single GPU users.
   gpu_split_auto: true
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py
index 8e50622..1e84336 100644
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -90,6 +90,7 @@ class ModelLoadRequest(BaseModel):
         examples=[4096],
     )
     tensor_parallel: Optional[bool] = None
+    tensor_parallel_backend: Optional[str] = "native"
    gpu_split_auto: Optional[bool] = None
     autosplit_reserve: Optional[List[float]] = None
     gpu_split: Optional[List[float]] = Field(
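For reference, a minimal usage sketch for the option added above, assuming the
key names exactly as they appear in config_sample.yml in this patch (an
NVLink-capable Linux setup is assumed; on other setups the loader falls back
to "native" via exllama_supports_nccl()):

  model:
    # Enable tensor parallelism and request the NCCL backend.
    tensor_parallel: true
    tensor_parallel_backend: nccl

The same field is also accepted on ModelLoadRequest, so a per-request model
load can set tensor_parallel_backend without editing the config file.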