Model: Add tensor_parallel_backend option
This allows users to choose nccl or native depending on the GPU setup. NCCL is only available with Linux-built wheels.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
parent b9952f319e
commit 43f9483bc4

4 changed files with 26 additions and 2 deletions
@@ -25,6 +25,7 @@ from loguru import logger
 from backends.base_model_container import BaseModelContainer
 from backends.exllamav3.sampler import ExllamaV3SamplerBuilder
+from backends.exllamav3.utils import exllama_supports_nccl
 from backends.exllamav3.vision import clear_image_embedding_cache
 from common.concurrency import iterate_in_threadpool
 from common.gen_logging import (
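The exllama_supports_nccl helper imported above is defined in backends/exllamav3/utils.py and is not part of this diff. A minimal sketch of what such a check might look like, assuming (per the commit message) that NCCL availability can be gated on the platform alone:

import platform

def exllama_supports_nccl() -> bool:
    """Hypothetical sketch: NCCL kernels ship only with Linux-built wheels."""
    return platform.system() == "Linux"

The real helper may also inspect the installed exllamav3 wheel, so treat this as an illustration of the contract rather than the implementation.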
@@ -78,6 +79,7 @@ class ExllamaV3Container(BaseModelContainer):
     gpu_split_auto: bool = True
     autosplit_reserve: Optional[List[float]] = [96 / 1024]
     use_tp: bool = False
+    tp_backend: str = "native"
     max_seq_len: int = 4096
     cache_size: int = 4096
     cache_mode: str = "FP16"
@@ -173,6 +175,12 @@ class ExllamaV3Container(BaseModelContainer):
         # Set tensor parallel
         if use_tp:
             self.use_tp = True
+            tp_backend = unwrap(kwargs.get("tensor_parallel_backend"), "native")
+
+            if not exllama_supports_nccl():
+                tp_backend = "native"
+
+            self.tp_backend = tp_backend

             # TP has its own autosplit loader
             self.gpu_split_auto = False
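For readers unfamiliar with unwrap, it is a small null-coalescing helper defined elsewhere in the codebase (not shown in this diff); its behavior is roughly:

def unwrap(value, default):
    # Return the value unless it is None, otherwise fall back to the default
    return value if value is not None else default

So a load request that omits tensor_parallel_backend resolves to "native", and the exllama_supports_nccl() check then forces "native" on platforms without NCCL support.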
@@ -458,7 +466,7 @@ class ExllamaV3Container(BaseModelContainer):
         for value in self.model.load_gen(
             tensor_p=self.use_tp,
-            tp_backend="native",
+            tp_backend=self.tp_backend,
             reserve_per_device=self.autosplit_reserve,
             use_per_device=self.gpu_split,
             callback=progress_callback,
@@ -183,11 +183,20 @@ class ModelConfig(BaseConfigModel):
     tensor_parallel: Optional[bool] = Field(
         False,
         description=(
-            "Load model with tensor parallelism.\n"
+            "Load model with tensor parallelism (default: False).\n"
             "Falls back to autosplit if GPU split isn't provided.\n"
             "This ignores the gpu_split_auto value."
         ),
     )
+    tensor_parallel_backend: Optional[str] = Field(
+        "native",
+        description=(
+            "Sets a backend type for tensor parallelism. (default: native).\n"
+            "Options: native, nccl\n"
+            "Native is recommended for PCIe GPUs\n"
+            "NCCL is recommended for NVLink."
+        ),
+    )
     gpu_split_auto: Optional[bool] = Field(
         True,
         description=(
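Note that the new field is a plain Optional[str] rather than an enum, so an unrecognized value is accepted by the config model and only corrected by the container's NCCL fallback at load time. A quick illustration, assuming the import path and that ModelConfig can be constructed standalone:

from common.config_models import ModelConfig  # assumed import path

cfg = ModelConfig(tensor_parallel=True, tensor_parallel_backend="nccl")
assert cfg.tensor_parallel_backend == "nccl"

# Omitting the field falls back to the Field default
assert ModelConfig().tensor_parallel_backend == "native"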
@@ -87,6 +87,12 @@ model:
   # This ignores the gpu_split_auto value.
   tensor_parallel: false

+  # Sets a backend type for tensor parallelism. (default: native).
+  # Options: native, nccl
+  # Native is recommended for PCIe GPUs
+  # NCCL is recommended for NVLink.
+  tensor_parallel_backend: native
+
   # Automatically allocate resources to GPUs (default: True).
   # Not parsed for single GPU users.
   gpu_split_auto: true
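Putting the sample options together, a config for an NVLink-connected multi-GPU box might look like the snippet below; the model_name value is a placeholder and the surrounding keys follow the existing sample file:

model:
  model_name: my-model            # placeholder
  tensor_parallel: true
  tensor_parallel_backend: nccl   # use native for PCIe-only setups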
@@ -90,6 +90,7 @@ class ModelLoadRequest(BaseModel):
         examples=[4096],
     )
     tensor_parallel: Optional[bool] = None
+    tensor_parallel_backend: Optional[str] = "native"
     gpu_split_auto: Optional[bool] = None
     autosplit_reserve: Optional[List[float]] = None
     gpu_split: Optional[List[float]] = Field(
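Because ModelLoadRequest now carries the field, the backend can also be chosen per request. A hypothetical load call is sketched below; the endpoint path, port, auth header, and model_name field are assumptions about the surrounding API rather than part of this diff:

import requests

response = requests.post(
    "http://localhost:5000/v1/model/load",    # assumed endpoint and port
    headers={"x-admin-key": "<admin key>"},   # assumed auth header
    json={
        "model_name": "my-model",             # placeholder
        "tensor_parallel": True,
        "tensor_parallel_backend": "nccl",    # downgraded to "native" off Linux
    },
)
print(response.status_code)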