Model: Add tensor_parallel_backend option

This allows users to choose between the nccl and native backends depending on their GPU setup.
NCCL is only available in Linux-built wheels.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
kingbri 2025-08-17 21:42:30 -04:00
parent b9952f319e
commit 43f9483bc4
4 changed files with 26 additions and 2 deletions


@@ -25,6 +25,7 @@ from loguru import logger
 from backends.base_model_container import BaseModelContainer
 from backends.exllamav3.sampler import ExllamaV3SamplerBuilder
+from backends.exllamav3.utils import exllama_supports_nccl
 from backends.exllamav3.vision import clear_image_embedding_cache
 from common.concurrency import iterate_in_threadpool
 from common.gen_logging import (
@@ -78,6 +79,7 @@ class ExllamaV3Container(BaseModelContainer):
     gpu_split_auto: bool = True
     autosplit_reserve: Optional[List[float]] = [96 / 1024]
     use_tp: bool = False
+    tp_backend: str = "native"
     max_seq_len: int = 4096
     cache_size: int = 4096
     cache_mode: str = "FP16"
@@ -173,6 +175,12 @@ class ExllamaV3Container(BaseModelContainer):
         # Set tensor parallel
         if use_tp:
             self.use_tp = True
+
+            tp_backend = unwrap(kwargs.get("tensor_parallel_backend"), "native")
+            if not exllama_supports_nccl():
+                tp_backend = "native"
+
+            self.tp_backend = tp_backend
 
             # TP has its own autosplit loader
             self.gpu_split_auto = False
@@ -458,7 +466,7 @@ class ExllamaV3Container(BaseModelContainer):
         for value in self.model.load_gen(
             tensor_p=self.use_tp,
-            tp_backend="native",
+            tp_backend=self.tp_backend,
             reserve_per_device=self.autosplit_reserve,
             use_per_device=self.gpu_split,
             callback=progress_callback,
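The exllama_supports_nccl helper imported above lives in backends/exllamav3/utils and is not shown in this commit. A minimal sketch of the gate it likely implements, assuming (per the commit message) that NCCL ships only with Linux-built wheels:

# Hypothetical sketch -- the real helper is not part of this diff.
import platform

def exllama_supports_nccl() -> bool:
    # NCCL is only available in Linux-built exllamav3 wheels,
    # so any other platform falls back to the native backend.
    return platform.system() == "Linux"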


@@ -183,11 +183,20 @@ class ModelConfig(BaseConfigModel):
     tensor_parallel: Optional[bool] = Field(
         False,
         description=(
-            "Load model with tensor parallelism.\n"
+            "Load model with tensor parallelism (default: False).\n"
             "Falls back to autosplit if GPU split isn't provided.\n"
             "This ignores the gpu_split_auto value."
         ),
     )
+    tensor_parallel_backend: Optional[str] = Field(
+        "native",
+        description=(
+            "Sets a backend type for tensor parallelism (default: native).\n"
+            "Options: native, nccl\n"
+            "Native is recommended for PCIe GPUs.\n"
+            "NCCL is recommended for NVLink."
+        ),
+    )
     gpu_split_auto: Optional[bool] = Field(
         True,
         description=(
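Since tensor_parallel_backend is typed as a plain str, a typo like "ncll" passes config validation unchanged and is only caught later, at load time. Not part of this commit, but a Literal annotation would reject bad values up front; a minimal sketch assuming pydantic v2:

from typing import Literal, Optional

from pydantic import BaseModel, Field

# Hypothetical stand-in for ModelConfig, for illustration only
class ModelConfigSketch(BaseModel):
    tensor_parallel_backend: Optional[Literal["native", "nccl"]] = Field(
        "native",
        description="Backend type for tensor parallelism.",
    )

# ModelConfigSketch(tensor_parallel_backend="ncll") now raises a ValidationError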


@@ -87,6 +87,12 @@ model:
   # This ignores the gpu_split_auto value.
   tensor_parallel: false
 
+  # Sets a backend type for tensor parallelism (default: native).
+  # Options: native, nccl
+  # Native is recommended for PCIe GPUs.
+  # NCCL is recommended for NVLink.
+  tensor_parallel_backend: native
+
   # Automatically allocate resources to GPUs (default: True).
   # Not parsed for single GPU users.
   gpu_split_auto: true


@@ -90,6 +90,7 @@ class ModelLoadRequest(BaseModel):
         examples=[4096],
     )
     tensor_parallel: Optional[bool] = None
+    tensor_parallel_backend: Optional[str] = "native"
     gpu_split_auto: Optional[bool] = None
     autosplit_reserve: Optional[List[float]] = None
     gpu_split: Optional[List[float]] = Field(
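With ModelLoadRequest updated, clients can pick the backend per load. A hypothetical usage sketch; the endpoint path, port, and header name are assumptions, not confirmed by this diff:

import requests

# Placeholder model name and admin key
payload = {
    "model_name": "my-model",
    "tensor_parallel": True,
    "tensor_parallel_backend": "nccl",  # or "native" for PCIe-only setups
}

response = requests.post(
    "http://127.0.0.1:5000/v1/model/load",  # assumed local endpoint
    json=payload,
    headers={"x-admin-key": "example-admin-key"},  # assumed auth header
)
print(response.status_code)

If the server's wheel lacks NCCL support, the container silently falls back to the native backend per the exllama_supports_nccl check above.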