From 60ae419746dca217367b876b2b34a8856f64b024 Mon Sep 17 00:00:00 2001
From: Forkoz <59298527+Ph0rk0z@users.noreply.github.com>
Date: Tue, 12 Aug 2025 21:01:54 +0000
Subject: [PATCH] Model.py TP changes

---
 backends/exllamav3/model.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index 4f67fd3..05ae386 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -163,13 +163,19 @@ class ExllamaV3Container(BaseModelContainer):
         gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
         gpu_split = unwrap(kwargs.get("gpu_split"), None)
         gpu_device_list = list(range(0, gpu_count))
+        use_tp = unwrap(kwargs.get("tensor_parallel"), False)
 
         # Set GPU split options
         if gpu_count == 1:
             self.gpu_split_auto = False
             logger.info("Disabling GPU split because one GPU is in use.")
         else:
-            # TODO: Set tensor parallel
+            # Set tensor parallel
+            if use_tp:
+                self.use_tp = True
+
+                # TP has its own autosplit loader
+                self.gpu_split_auto = False
 
             # Set GPU split options
             # Enable manual GPU split if provided
@@ -451,6 +457,8 @@ class ExllamaV3Container(BaseModelContainer):
             yield value
 
         for value in self.model.load_gen(
+            tensor_p=self.use_tp,
+            tp_backend="native",
             reserve_per_device=self.autosplit_reserve,
             use_per_device=self.gpu_split,
             callback=progress_callback,
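
Note: below is a minimal, self-contained sketch of the GPU-split / tensor-parallel gating this patch introduces. The ContainerSketch class, the set_gpu_split_options method name, and the unwrap helper are simplified stand-ins for tabbyAPI's actual container and utility code (included only so the snippet runs on its own); the real flag plumbing is exactly the first hunk above.

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)


    def unwrap(value, default):
        """Return value unless it is None, in which case return default."""
        return default if value is None else value


    class ContainerSketch:
        # Class-level defaults, assumed here so the attributes always exist.
        gpu_split_auto = True
        use_tp = False

        def set_gpu_split_options(self, gpu_count, **kwargs):
            use_tp = unwrap(kwargs.get("tensor_parallel"), False)

            if gpu_count == 1:
                # Single GPU: no split of any kind.
                self.gpu_split_auto = False
                logger.info("Disabling GPU split because one GPU is in use.")
            elif use_tp:
                # Per the patch: TP has its own autosplit loader, so the
                # regular autosplit path is bypassed.
                self.use_tp = True
                self.gpu_split_auto = False
            # Otherwise the existing manual/auto split handling applies.


    # Example: two GPUs with tensor parallel requested.
    container = ContainerSketch()
    container.set_gpu_split_options(2, tensor_parallel=True)
    assert container.use_tp and not container.gpu_split_auto

With self.use_tp set, the second hunk then forwards tensor_p=self.use_tp and tp_backend="native" into self.model.load_gen(...), so ExLlamaV3's native TP loader handles device placement alongside the existing reserve_per_device/use_per_device options.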