From 43f9483bc435083fc3751e664c14f2eb6acb6ad2 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Sun, 17 Aug 2025 21:42:30 -0400
Subject: [PATCH] Model: Add tensor_parallel_backend option

This allows users to choose the nccl or native backend depending on
their GPU setup. NCCL is only available with Linux-built wheels.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 backends/exllamav3/model.py   | 10 +++++++++-
 common/config_models.py       | 11 ++++++++++-
 config_sample.yml             |  6 ++++++
 endpoints/core/types/model.py |  1 +
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index 05ae386..53d2e91 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -25,6 +25,7 @@ from loguru import logger
 
 from backends.base_model_container import BaseModelContainer
 from backends.exllamav3.sampler import ExllamaV3SamplerBuilder
+from backends.exllamav3.utils import exllama_supports_nccl
 from backends.exllamav3.vision import clear_image_embedding_cache
 from common.concurrency import iterate_in_threadpool
 from common.gen_logging import (
@@ -78,6 +79,7 @@ class ExllamaV3Container(BaseModelContainer):
     gpu_split_auto: bool = True
     autosplit_reserve: Optional[List[float]] = [96 / 1024]
     use_tp: bool = False
+    tp_backend: str = "native"
     max_seq_len: int = 4096
     cache_size: int = 4096
     cache_mode: str = "FP16"
@@ -173,6 +175,12 @@ class ExllamaV3Container(BaseModelContainer):
         # Set tensor parallel
         if use_tp:
             self.use_tp = True
+            tp_backend = unwrap(kwargs.get("tensor_parallel_backend"), "native")
+
+            if not exllama_supports_nccl():
+                tp_backend = "native"
+
+            self.tp_backend = tp_backend
 
             # TP has its own autosplit loader
             self.gpu_split_auto = False
@@ -458,7 +466,7 @@ class ExllamaV3Container(BaseModelContainer):
 
         for value in self.model.load_gen(
             tensor_p=self.use_tp,
-            tp_backend="native",
+            tp_backend=self.tp_backend,
             reserve_per_device=self.autosplit_reserve,
             use_per_device=self.gpu_split,
             callback=progress_callback,
diff --git a/common/config_models.py b/common/config_models.py
index b2b36e4..9898c04 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -183,11 +183,20 @@ class ModelConfig(BaseConfigModel):
     tensor_parallel: Optional[bool] = Field(
         False,
         description=(
-            "Load model with tensor parallelism.\n"
+            "Load model with tensor parallelism (default: False).\n"
             "Falls back to autosplit if GPU split isn't provided.\n"
             "This ignores the gpu_split_auto value."
         ),
     )
+    tensor_parallel_backend: Optional[str] = Field(
+        "native",
+        description=(
+            "Sets the backend for tensor parallelism (default: native).\n"
+            "Options: native, nccl\n"
+            "Native is recommended for PCIe GPUs.\n"
+            "NCCL is recommended for NVLink."
+        ),
+    )
     gpu_split_auto: Optional[bool] = Field(
         True,
         description=(
diff --git a/config_sample.yml b/config_sample.yml
index 97cafa6..890596e 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -87,6 +87,12 @@ model:
   # This ignores the gpu_split_auto value.
   tensor_parallel: false
 
+  # Sets the backend for tensor parallelism (default: native).
+  # Options: native, nccl
+  # Native is recommended for PCIe GPUs.
+  # NCCL is recommended for NVLink.
+  tensor_parallel_backend: native
+
   # Automatically allocate resources to GPUs (default: True).
   # Not parsed for single GPU users.
   gpu_split_auto: true
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py
index 8e50622..1e84336 100644
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -90,6 +90,7 @@ class ModelLoadRequest(BaseModel):
         examples=[4096],
     )
     tensor_parallel: Optional[bool] = None
+    tensor_parallel_backend: Optional[str] = "native"
    gpu_split_auto: Optional[bool] = None
     autosplit_reserve: Optional[List[float]] = None
     gpu_split: Optional[List[float]] = Field(
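For reference, a minimal usage sketch for the option added above, assuming the
key names exactly as they appear in config_sample.yml in this patch (an
NVLink-capable Linux setup is assumed; on other setups the loader falls back
to "native" via exllama_supports_nccl()):

  model:
    # Enable tensor parallelism and request the NCCL backend.
    tensor_parallel: true
    tensor_parallel_backend: nccl

The same field is also accepted on ModelLoadRequest, so a per-request model
load can set tensor_parallel_backend without editing the config file.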