From 871c89063d9f773f128c205a50fedeb9fa69b193 Mon Sep 17 00:00:00 2001
From: kingbri
Date: Fri, 16 Aug 2024 16:35:19 -0400
Subject: [PATCH] Model: Add Tensor Parallel support

Use the tensor parallel loader when the flag is enabled. The new loader
has its own autosplit implementation, so gpu_split_auto isn't valid
here.

Also make it easier to determine which cache type to use, rather than
relying on multiple if/else statements.

Signed-off-by: kingbri
---
 backends/exllamav2/model.py   | 147 ++++++++++++++++++++++------------
 common/args.py                |   5 ++
 config_sample.yml             |   7 ++
 endpoints/core/types/model.py |   3 +
 4 files changed, 109 insertions(+), 53 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 0648dbc..d472117 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -50,6 +50,14 @@ from common.templating import (
 from common.transformers_utils import GenerationConfig, HuggingFaceConfig
 from common.utils import coalesce, unwrap
 
+# Dynamic imports
+try:
+    from exllamav2 import ExLlamaV2Cache_TP
+
+    has_tp = True
+except ImportError:
+    has_tp = False
+
 
 class ExllamaV2Container:
     """The model container class for ExLlamaV2 models."""
@@ -78,6 +86,7 @@ class ExllamaV2Container:
     gpu_split: Optional[list] = None
     gpu_split_auto: bool = True
     autosplit_reserve: List[float] = [96 * 1024**2]
+    use_tp: bool = False
 
     # Load state
     model_is_loading: bool = False
@@ -144,30 +153,52 @@ class ExllamaV2Container:
         # Turn off GPU split if the user is using 1 GPU
         gpu_count = torch.cuda.device_count()
         gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+        use_tp = unwrap(kwargs.get("tensor_parallel"), False)
+        gpu_split = kwargs.get("gpu_split")
         gpu_device_list = list(range(0, gpu_count))
 
-        if gpu_count > 1 and gpu_split_auto:
-            # Auto GPU split parameters
-            self.gpu_split_auto = gpu_split_auto
-
-            autosplit_reserve_megabytes = unwrap(kwargs.get("autosplit_reserve"), [96])
-            self.autosplit_reserve = [
-                int(math.ceil(value * 1024**2)) for value in autosplit_reserve_megabytes
-            ]
-        elif gpu_count > 1:
-            # Manual GPU split
-            self.gpu_split = kwargs.get("gpu_split")
-            self.gpu_split_auto = False
-
-            gpu_device_list = [
-                device_idx
-                for device_idx, memory in enumerate(self.gpu_split)
-                if memory > 0
-            ]
-        else:
-            # One GPU setup
+        # Set GPU split options
+        if gpu_count == 1:
             self.gpu_split_auto = False
             logger.info("Disabling GPU split because one GPU is in use.")
+        else:
+            # Set tensor parallel
+            if use_tp:
+                if has_tp:
+                    self.use_tp = True
+
+                    # TP has its own autosplit loader
+                    self.gpu_split_auto = False
+                else:
+                    # TODO: Remove conditional with exl2 v0.1.9 release
+                    logger.warning(
+                        "Tensor parallelism is not supported in the "
+                        "current ExllamaV2 version."
+                    )
+
+            # Enable manual GPU split if provided
+            if gpu_split:
+                self.gpu_split_auto = False
+                self.gpu_split = gpu_split
+
+                gpu_device_list = [
+                    device_idx
+                    for device_idx, memory in enumerate(self.gpu_split)
+                    if memory > 0
+                ]
+            elif gpu_split_auto and not self.use_tp:
+                # Otherwise fallback to autosplit settings
+                self.gpu_split_auto = gpu_split_auto
+
+                autosplit_reserve_megabytes = unwrap(
+                    kwargs.get("autosplit_reserve"), [96]
+                )
+
+                # Reserve VRAM for each GPU
+                self.autosplit_reserve = [
+                    int(math.ceil(value * 1024**2))
+                    for value in autosplit_reserve_megabytes
+                ]
 
         self.config = ExLlamaV2Config()
         self.config.model_dir = str(model_directory.resolve())
@@ -182,10 +213,7 @@ class ExllamaV2Container:
         self.config.prepare()
 
         # Check if the model arch is compatible with various exl2 features
-        try:
-            self.config.arch_compat_overrides()
-        except AttributeError:
-            pass
+        self.config.arch_compat_overrides()
 
         # Create the hf_config
         self.hf_config = HuggingFaceConfig.from_file(model_directory)
@@ -548,9 +576,11 @@ class ExllamaV2Container:
             if not self.quiet:
                 logger.info("Loading draft model: " + self.draft_config.model_dir)
 
+            # Draft uses the autosplit loader, so create a cache that reflects this
             self.draft_cache = self.create_cache(
                 cache_mode=self.draft_cache_mode,
                 autosplit=True,
+                use_tp=False,
             )
 
             for value in self.draft_model.load_autosplit_gen(
@@ -572,7 +602,17 @@ class ExllamaV2Container:
 
         # Load model with manual split
         # Entrypoint for single GPU users
-        if not self.gpu_split_auto:
+        if self.use_tp:
+            logger.info("Loading with tensor parallel")
+
+            for value in self.model.load_tp_gen(
+                self.gpu_split,
+                callback_gen=progress_callback,
+                expect_cache_tokens=self.cache_size,
+            ):
+                if value:
+                    yield value
+        elif not self.gpu_split_auto:
             logger.info("Loading with a manual GPU split (or a one GPU setup)")
 
             for value in self.model.load_gen(
@@ -582,13 +622,15 @@ class ExllamaV2Container:
                 if value:
                     yield value
 
+        # Create the model cache
         self.cache = self.create_cache(
             cache_mode=self.cache_mode,
            autosplit=self.gpu_split_auto,
+            use_tp=self.use_tp,
         )
 
-        # Load model with autosplit
-        if self.gpu_split_auto:
+        # Load model with autosplit (without TP)
+        if self.gpu_split_auto and not self.use_tp:
             logger.info("Loading with autosplit")
 
             for value in self.model.load_autosplit_gen(
@@ -604,36 +646,35 @@ class ExllamaV2Container:
         input_ids = torch.zeros((1, self.config.max_input_len), dtype=torch.long)
         self.model.forward(input_ids, cache=self.cache, preprocess_only=True)
 
-    def create_cache(self, cache_mode: str, autosplit: bool):
+    def create_cache(self, cache_mode: str, autosplit: bool, use_tp: bool):
+        if has_tp and use_tp:
+            if self.cache_mode != "FP16":
+                logger.warning(
+                    "Tensor parallel does not currently allow for use of "
+                    "a quantized K/V cache. Using the specialized TP cache."
+                )
+
+            return ExLlamaV2Cache_TP(
+                self.model,
+                max_seq_len=self.cache_size,
+                batch_size=1,
+            )
+
+        cache_type = ExLlamaV2Cache
         match cache_mode:
             case "Q4":
-                return ExLlamaV2Cache_Q4(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=autosplit,
-                    batch_size=1,
-                )
+                cache_type = ExLlamaV2Cache_Q4
             case "Q6":
-                return ExLlamaV2Cache_Q6(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=self.gpu_split_auto,
-                    batch_size=1,
-                )
+                cache_type = ExLlamaV2Cache_Q6
             case "Q8":
-                return ExLlamaV2Cache_Q8(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=autosplit,
-                    batch_size=1,
-                )
-            case _:
-                return ExLlamaV2Cache(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=self.gpu_split_auto,
-                    batch_size=1,
-                )
+                cache_type = ExLlamaV2Cache_Q8
+
+        return cache_type(
+            self.model,
+            max_seq_len=self.cache_size,
+            lazy=autosplit,
+            batch_size=1,
+        )
 
     async def create_generator(self):
         try:
diff --git a/common/args.py b/common/args.py
index a0f19c2..b879ec3 100644
--- a/common/args.py
+++ b/common/args.py
@@ -107,6 +107,11 @@ def add_model_args(parser: argparse.ArgumentParser):
         type=str_to_bool,
         help="Overrides base model context length",
     )
+    model_group.add_argument(
+        "--tensor-parallel",
+        type=str_to_bool,
+        help="Use tensor parallelism to load models",
+    )
     model_group.add_argument(
         "--gpu-split-auto",
         type=str_to_bool,
diff --git a/config_sample.yml b/config_sample.yml
index 018ff61..98f4fb7 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -109,6 +109,12 @@ model:
   # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral 7B)
   #override_base_seq_len:
 
+  # Load model with tensor parallelism
+  # If a GPU split isn't provided, the TP loader will fall back to autosplit
+  # Enabling this ignores the gpu_split_auto and autosplit_reserve values
+  # NOTE: Requires a development build of exllamav2
+  #tensor_parallel: False
+
   # Automatically allocate resources to GPUs (default: True)
   # NOTE: Not parsed for single GPU users
   #gpu_split_auto: True
@@ -118,6 +124,7 @@ model:
   #autosplit_reserve: [96]
 
   # An integer array of GBs of vram to split between GPUs (default: [])
+  # Used with tensor parallelism
   # NOTE: Not parsed for single GPU users
   #gpu_split: [20.6, 24]
 
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py
index 1e2eb46..c4959fe 100644
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -96,6 +96,9 @@ class ModelLoadRequest(BaseModel):
         default_factory=lambda: get_config_default("cache_size"),
         examples=[4096],
     )
+    tensor_parallel: Optional[bool] = Field(
+        default_factory=lambda: get_config_default("tensor_parallel", False)
+    )
     gpu_split_auto: Optional[bool] = Field(
         default_factory=lambda: get_config_default("gpu_split_auto", True)
     )
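
For reference, the create_cache() consolidation above boils down to choosing one cache class and calling a single constructor, with the tensor-parallel cache handled as a special case. The sketch below restates that pattern as a standalone function: make_cache() and its parameters are invented for illustration (the real method lives on ExllamaV2Container), a dict lookup stands in for the patch's match statement, and ExLlamaV2Cache_TP is assumed to be available only in newer exllamav2 builds, hence the guarded import.

    # Illustrative sketch only: make_cache() is not part of TabbyAPI or exllamav2.
    from exllamav2 import (
        ExLlamaV2Cache,
        ExLlamaV2Cache_Q4,
        ExLlamaV2Cache_Q6,
        ExLlamaV2Cache_Q8,
    )

    # ExLlamaV2Cache_TP only ships in newer exllamav2 builds, so guard the import
    try:
        from exllamav2 import ExLlamaV2Cache_TP

        has_tp = True
    except ImportError:
        has_tp = False


    def make_cache(model, cache_mode: str, cache_size: int, autosplit: bool, use_tp: bool):
        """Pick one cache class up front instead of duplicating the constructor call."""

        if has_tp and use_tp:
            # The TP loader manages its own device split, so the dedicated TP cache
            # is constructed without the lazy/autosplit flag (as in the patch above)
            return ExLlamaV2Cache_TP(model, max_seq_len=cache_size, batch_size=1)

        # Map quantized cache modes to their classes, defaulting to the FP16 cache
        cache_type = {
            "Q4": ExLlamaV2Cache_Q4,
            "Q6": ExLlamaV2Cache_Q6,
            "Q8": ExLlamaV2Cache_Q8,
        }.get(cache_mode, ExLlamaV2Cache)

        # All non-TP cache classes share the same constructor signature
        return cache_type(model, max_seq_len=cache_size, lazy=autosplit, batch_size=1)

The same reasoning drives the loader changes: because the TP loader brings its own autosplit implementation, enabling tensor_parallel forces gpu_split_auto off, and a manual gpu_split, when provided, is passed straight to load_tp_gen().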