Model: Add tensor_parallel_backend option
This allows users to choose nccl or native depending on the GPU setup. NCCL is only available with Linux-built wheels.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
parent b9952f319e
commit 43f9483bc4

4 changed files with 26 additions and 2 deletions
@@ -25,6 +25,7 @@ from loguru import logger
 from backends.base_model_container import BaseModelContainer
 from backends.exllamav3.sampler import ExllamaV3SamplerBuilder
+from backends.exllamav3.utils import exllama_supports_nccl
 from backends.exllamav3.vision import clear_image_embedding_cache
 from common.concurrency import iterate_in_threadpool
 from common.gen_logging import (
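The exllama_supports_nccl helper imported above is defined in backends/exllamav3/utils.py and is not part of this diff. A minimal sketch of what such a check might look like, assuming (per the commit message) that NCCL availability can be gated on the platform alone:

import platform

def exllama_supports_nccl() -> bool:
    """Hypothetical sketch: NCCL kernels ship only with Linux-built wheels."""
    return platform.system() == "Linux"

The real helper may also inspect the installed exllamav3 wheel, so treat this as an illustration of the contract rather than the implementation.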
@@ -78,6 +79,7 @@ class ExllamaV3Container(BaseModelContainer):
     gpu_split_auto: bool = True
     autosplit_reserve: Optional[List[float]] = [96 / 1024]
     use_tp: bool = False
+    tp_backend: str = "native"
     max_seq_len: int = 4096
     cache_size: int = 4096
     cache_mode: str = "FP16"
@@ -173,6 +175,12 @@ class ExllamaV3Container(BaseModelContainer):
         # Set tensor parallel
         if use_tp:
             self.use_tp = True
+            tp_backend = unwrap(kwargs.get("tensor_parallel_backend"), "native")
+
+            if not exllama_supports_nccl():
+                tp_backend = "native"
+
+            self.tp_backend = tp_backend

             # TP has its own autosplit loader
             self.gpu_split_auto = False
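For readers unfamiliar with unwrap, it is a small null-coalescing helper defined elsewhere in the codebase (not shown in this diff); its behavior is roughly:

def unwrap(value, default):
    # Return the value unless it is None, otherwise fall back to the default
    return value if value is not None else default

So a load request that omits tensor_parallel_backend resolves to "native", and the exllama_supports_nccl() check then forces "native" on platforms without NCCL support.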
@@ -458,7 +466,7 @@ class ExllamaV3Container(BaseModelContainer):
         for value in self.model.load_gen(
             tensor_p=self.use_tp,
-            tp_backend="native",
+            tp_backend=self.tp_backend,
             reserve_per_device=self.autosplit_reserve,
             use_per_device=self.gpu_split,
             callback=progress_callback,
@@ -183,11 +183,20 @@ class ModelConfig(BaseConfigModel):
     tensor_parallel: Optional[bool] = Field(
         False,
         description=(
-            "Load model with tensor parallelism.\n"
+            "Load model with tensor parallelism (default: False).\n"
             "Falls back to autosplit if GPU split isn't provided.\n"
             "This ignores the gpu_split_auto value."
         ),
     )
+    tensor_parallel_backend: Optional[str] = Field(
+        "native",
+        description=(
+            "Sets a backend type for tensor parallelism. (default: native).\n"
+            "Options: native, nccl\n"
+            "Native is recommended for PCIe GPUs\n"
+            "NCCL is recommended for NVLink."
+        ),
+    )
     gpu_split_auto: Optional[bool] = Field(
         True,
         description=(
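Note that the new field is a plain Optional[str] rather than an enum, so an unrecognized value is accepted by the config model and only corrected by the container's NCCL fallback at load time. A quick illustration, assuming the import path and that ModelConfig can be constructed standalone:

from common.config_models import ModelConfig  # assumed import path

cfg = ModelConfig(tensor_parallel=True, tensor_parallel_backend="nccl")
assert cfg.tensor_parallel_backend == "nccl"

# Omitting the field falls back to the Field default
assert ModelConfig().tensor_parallel_backend == "native"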
@@ -87,6 +87,12 @@ model:
   # This ignores the gpu_split_auto value.
   tensor_parallel: false

+  # Sets a backend type for tensor parallelism. (default: native).
+  # Options: native, nccl
+  # Native is recommended for PCIe GPUs
+  # NCCL is recommended for NVLink.
+  tensor_parallel_backend: native
+
   # Automatically allocate resources to GPUs (default: True).
   # Not parsed for single GPU users.
   gpu_split_auto: true
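Putting the sample options together, a config for an NVLink-connected multi-GPU box might look like the snippet below; the model_name value is a placeholder and the surrounding keys follow the existing sample file:

model:
  model_name: my-model            # placeholder
  tensor_parallel: true
  tensor_parallel_backend: nccl   # use native for PCIe-only setups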
@@ -90,6 +90,7 @@ class ModelLoadRequest(BaseModel):
         examples=[4096],
     )
     tensor_parallel: Optional[bool] = None
+    tensor_parallel_backend: Optional[str] = "native"
     gpu_split_auto: Optional[bool] = None
     autosplit_reserve: Optional[List[float]] = None
     gpu_split: Optional[List[float]] = Field(
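Because ModelLoadRequest now carries the field, the backend can also be chosen per request. A hypothetical load call is sketched below; the endpoint path, port, auth header, and model_name field are assumptions about the surrounding API rather than part of this diff:

import requests

response = requests.post(
    "http://localhost:5000/v1/model/load",    # assumed endpoint and port
    headers={"x-admin-key": "<admin key>"},   # assumed auth header
    json={
        "model_name": "my-model",             # placeholder
        "tensor_parallel": True,
        "tensor_parallel_backend": "nccl",    # downgraded to "native" off Linux
    },
)
print(response.status_code)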