Model.py TP changes

Forkoz 2025-08-12 21:01:54 +00:00, committed by GitHub
parent 6623dbcd86
commit 60ae419746

@@ -163,13 +163,19 @@ class ExllamaV3Container(BaseModelContainer):
         gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
         gpu_split = unwrap(kwargs.get("gpu_split"), None)
         gpu_device_list = list(range(0, gpu_count))
+        use_tp = unwrap(kwargs.get("tensor_parallel"), False)
 
         # Set GPU split options
         if gpu_count == 1:
             self.gpu_split_auto = False
             logger.info("Disabling GPU split because one GPU is in use.")
         else:
-            # TODO: Set tensor parallel
+            # Set tensor parallel
+            if use_tp:
+                self.use_tp = True
+
+                # TP has its own autosplit loader
+                self.gpu_split_auto = False
 
             # Set GPU split options
             # Enable manual GPU split if provided
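Summarizing the first hunk: a single GPU disables splitting outright; otherwise, requesting tensor parallel flips use_tp on and turns the generic autosplit off, since TP brings its own autosplit loader, while a manual gpu_split can still be applied afterward. Below is a minimal standalone sketch of that decision logic, assuming a hypothetical resolve_split helper and SplitConfig holder (the container stores these as attributes instead), with the manual-split body assumed since the hunk truncates before it:

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class SplitConfig:
    # Hypothetical holder for the resolved flags; the container keeps
    # these as self.use_tp / self.gpu_split_auto / self.gpu_split
    use_tp: bool = False
    gpu_split_auto: bool = True
    gpu_split: Optional[List[float]] = None


def resolve_split(
    gpu_count: int,
    tensor_parallel: bool = False,
    gpu_split: Optional[List[float]] = None,
) -> SplitConfig:
    config = SplitConfig()
    if gpu_count == 1:
        # One GPU in use: nothing to split
        config.gpu_split_auto = False
    else:
        if tensor_parallel:
            # TP has its own autosplit loader
            config.use_tp = True
            config.gpu_split_auto = False
        if gpu_split:
            # Manual GPU split, if provided, also disables autosplit
            # (assumed body; the hunk cuts off before this branch)
            config.gpu_split_auto = False
            config.gpu_split = gpu_split
    return config


print(resolve_split(gpu_count=2, tensor_parallel=True))
# SplitConfig(use_tp=True, gpu_split_auto=False, gpu_split=None)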
@@ -451,6 +457,8 @@ class ExllamaV3Container(BaseModelContainer):
             yield value
 
         for value in self.model.load_gen(
+            tensor_p=self.use_tp,
+            tp_backend="native",
             reserve_per_device=self.autosplit_reserve,
             use_per_device=self.gpu_split,
             callback=progress_callback,
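The second hunk forwards the resolved flags into exllamav3's generator-style loader, which is consumed module by module so loading progress can be surfaced. A rough sketch of that consumption pattern with a stand-in load_gen follows; only the keyword names visible in the hunk are taken from the source, while the yielded value shape and the total_modules parameter are assumptions for illustration:

from typing import Callable, Generator, Optional, Tuple


def load_gen(
    tensor_p: bool = False,
    tp_backend: str = "native",
    callback: Optional[Callable[[int, int], None]] = None,
    total_modules: int = 4,  # assumed; stands in for the model's module count
) -> Generator[Tuple[int, int], None, None]:
    # Stand-in for exllamav3's generator-style loader: a real loader
    # would materialize weights here, honoring tensor_p / tp_backend
    # for device placement
    for module in range(1, total_modules + 1):
        if callback:
            callback(module, total_modules)
        yield module, total_modules


def progress_callback(done: int, total: int) -> None:
    print(f"loaded {done}/{total} modules")


# Mirrors the consumption pattern in the hunk above
for value in load_gen(tensor_p=True, tp_backend="native",
                      callback=progress_callback):
    pass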