Model: Add Tensor Parallel support
Use the tensor parallel loader when the flag is enabled. The new loader has its own autosplit implementation, so gpu_split_auto isn't valid here. Also make it easier to determine which cache type to use by picking the cache class once instead of repeating the constructor call in per-mode branches.

Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
parent 5002617eac
commit 871c89063d
4 changed files with 109 additions and 53 deletions
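As the commit message notes, cache-type selection now picks a cache class once and constructs it with shared arguments. Below is a minimal sketch of that pattern; it is illustrative rather than the tabbyAPI source, and the pick_cache_class helper name is invented for the example (only the ExLlamaV2 cache classes come from the diff further down):

from exllamav2 import (
    ExLlamaV2Cache,
    ExLlamaV2Cache_Q4,
    ExLlamaV2Cache_Q6,
    ExLlamaV2Cache_Q8,
)


def pick_cache_class(cache_mode: str):
    # Map a cache mode string to a cache class; FP16 (ExLlamaV2Cache) is the fallback.
    match cache_mode:
        case "Q4":
            return ExLlamaV2Cache_Q4
        case "Q6":
            return ExLlamaV2Cache_Q6
        case "Q8":
            return ExLlamaV2Cache_Q8
        case _:
            return ExLlamaV2Cache

The chosen class is then called a single time with the model, max_seq_len, lazy, and batch_size arguments, and the tensor parallel path short-circuits before this lookup with its own ExLlamaV2Cache_TP.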
@@ -50,6 +50,14 @@ from common.templating import (
 from common.transformers_utils import GenerationConfig, HuggingFaceConfig
 from common.utils import coalesce, unwrap
 
+# Dynamic imports
+try:
+    from exllamav2 import ExLlamaV2Cache_TP
+
+    has_tp = True
+except ImportError:
+    has_tp = False
+
 
 class ExllamaV2Container:
     """The model container class for ExLlamaV2 models."""
@@ -78,6 +86,7 @@ class ExllamaV2Container:
     gpu_split: Optional[list] = None
     gpu_split_auto: bool = True
     autosplit_reserve: List[float] = [96 * 1024**2]
+    use_tp: bool = False
 
     # Load state
     model_is_loading: bool = False
@@ -144,30 +153,52 @@
         # Turn off GPU split if the user is using 1 GPU
         gpu_count = torch.cuda.device_count()
         gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+        use_tp = unwrap(kwargs.get("tensor_parallel"), False)
+        gpu_split = kwargs.get("gpu_split")
         gpu_device_list = list(range(0, gpu_count))
 
-        if gpu_count > 1 and gpu_split_auto:
-            # Auto GPU split parameters
-            self.gpu_split_auto = gpu_split_auto
-
-            autosplit_reserve_megabytes = unwrap(kwargs.get("autosplit_reserve"), [96])
-            self.autosplit_reserve = [
-                int(math.ceil(value * 1024**2)) for value in autosplit_reserve_megabytes
-            ]
-        elif gpu_count > 1:
-            # Manual GPU split
-            self.gpu_split = kwargs.get("gpu_split")
-            self.gpu_split_auto = False
-
-            gpu_device_list = [
-                device_idx
-                for device_idx, memory in enumerate(self.gpu_split)
-                if memory > 0
-            ]
-        else:
-            # One GPU setup
+        # Set GPU split options
+        if gpu_count == 1:
             self.gpu_split_auto = False
             logger.info("Disabling GPU split because one GPU is in use.")
+        else:
+            # Set tensor parallel
+            if use_tp:
+                if has_tp:
+                    self.use_tp = True
+
+                    # TP has its own autosplit loader
+                    self.gpu_split_auto = False
+                else:
+                    # TODO: Remove conditional with exl2 v0.1.9 release
+                    logger.warning(
+                        "Tensor parallelism is not supported in the "
+                        "current ExllamaV2 version."
+                    )
+
+            # Enable manual GPU split if provided
+            if gpu_split:
+                self.gpu_split_auto = False
+                self.gpu_split = gpu_split
+
+                gpu_device_list = [
+                    device_idx
+                    for device_idx, memory in enumerate(self.gpu_split)
+                    if memory > 0
+                ]
+            elif gpu_split_auto and not self.use_tp:
+                # Otherwise fallback to autosplit settings
+                self.gpu_split_auto = gpu_split_auto
+
+                autosplit_reserve_megabytes = unwrap(
+                    kwargs.get("autosplit_reserve"), [96]
+                )
+
+                # Reserve VRAM for each GPU
+                self.autosplit_reserve = [
+                    int(math.ceil(value * 1024**2))
+                    for value in autosplit_reserve_megabytes
+                ]
 
         self.config = ExLlamaV2Config()
         self.config.model_dir = str(model_directory.resolve())
@@ -182,10 +213,7 @@ class ExllamaV2Container:
         self.config.prepare()
 
         # Check if the model arch is compatible with various exl2 features
-        try:
-            self.config.arch_compat_overrides()
-        except AttributeError:
-            pass
+        self.config.arch_compat_overrides()
 
         # Create the hf_config
         self.hf_config = HuggingFaceConfig.from_file(model_directory)
@@ -548,9 +576,11 @@ class ExllamaV2Container:
             if not self.quiet:
                 logger.info("Loading draft model: " + self.draft_config.model_dir)
 
+            # Draft uses the autosplit loader, so create a cache that reflects this
             self.draft_cache = self.create_cache(
                 cache_mode=self.draft_cache_mode,
                 autosplit=True,
+                use_tp=False,
             )
 
             for value in self.draft_model.load_autosplit_gen(
@@ -572,7 +602,17 @@ class ExllamaV2Container:
 
         # Load model with manual split
         # Entrypoint for single GPU users
-        if not self.gpu_split_auto:
+        if self.use_tp:
+            logger.info("Loading with tensor parallel")
+
+            for value in self.model.load_tp_gen(
+                self.gpu_split,
+                callback_gen=progress_callback,
+                expect_cache_tokens=self.cache_size,
+            ):
+                if value:
+                    yield value
+        elif not self.gpu_split_auto:
             logger.info("Loading with a manual GPU split (or a one GPU setup)")
 
             for value in self.model.load_gen(
@@ -582,13 +622,15 @@ class ExllamaV2Container:
                 if value:
                     yield value
 
+        # Create the model cache
         self.cache = self.create_cache(
             cache_mode=self.cache_mode,
             autosplit=self.gpu_split_auto,
+            use_tp=self.use_tp,
        )
 
-        # Load model with autosplit
-        if self.gpu_split_auto:
+        # Load model with autosplit (without TP)
+        if self.gpu_split_auto and not self.use_tp:
             logger.info("Loading with autosplit")
 
             for value in self.model.load_autosplit_gen(
@@ -604,36 +646,35 @@ class ExllamaV2Container:
         input_ids = torch.zeros((1, self.config.max_input_len), dtype=torch.long)
         self.model.forward(input_ids, cache=self.cache, preprocess_only=True)
 
-    def create_cache(self, cache_mode: str, autosplit: bool):
+    def create_cache(self, cache_mode: str, autosplit: bool, use_tp: bool):
+        if has_tp and use_tp:
+            if self.cache_mode != "FP16":
+                logger.warning(
+                    "Tensor parallel does not currently allow for use of "
+                    "a quantized K/V cache. Using the specialized TP cache."
+                )
+
+            return ExLlamaV2Cache_TP(
+                self.model,
+                max_seq_len=self.cache_size,
+                batch_size=1,
+            )
+
+        cache_type = ExLlamaV2Cache
         match cache_mode:
             case "Q4":
-                return ExLlamaV2Cache_Q4(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=autosplit,
-                    batch_size=1,
-                )
+                cache_type = ExLlamaV2Cache_Q4
             case "Q6":
-                return ExLlamaV2Cache_Q6(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=self.gpu_split_auto,
-                    batch_size=1,
-                )
+                cache_type = ExLlamaV2Cache_Q6
             case "Q8":
-                return ExLlamaV2Cache_Q8(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=autosplit,
-                    batch_size=1,
-                )
-            case _:
-                return ExLlamaV2Cache(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=self.gpu_split_auto,
-                    batch_size=1,
-                )
+                cache_type = ExLlamaV2Cache_Q8
+
+        return cache_type(
+            self.model,
+            max_seq_len=self.cache_size,
+            lazy=autosplit,
+            batch_size=1,
+        )
 
     async def create_generator(self):
         try:
@@ -107,6 +107,11 @@ def add_model_args(parser: argparse.ArgumentParser):
         type=str_to_bool,
         help="Overrides base model context length",
     )
+    model_group.add_argument(
+        "--tensor-parallel",
+        type=str_to_bool,
+        help="Use tensor parallelism to load models",
+    )
    model_group.add_argument(
         "--gpu-split-auto",
         type=str_to_bool,
@@ -109,6 +109,12 @@ model:
   # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral 7B)
   #override_base_seq_len:
 
+  # Load model with tensor parallelism
+  # If a GPU split isn't provided, the TP loader will fallback to autosplit
+  # Enabling ignores the gpu_split_auto and autosplit_reserve values
+  # NOTE: Requires a development build of exllamav2
+  #tensor_parallel: False
+
   # Automatically allocate resources to GPUs (default: True)
   # NOTE: Not parsed for single GPU users
   #gpu_split_auto: True
@@ -118,6 +124,7 @@ model:
   #autosplit_reserve: [96]
 
   # An integer array of GBs of vram to split between GPUs (default: [])
+  # Used with tensor parallelism
   # NOTE: Not parsed for single GPU users
   #gpu_split: [20.6, 24]
 
@@ -96,6 +96,9 @@ class ModelLoadRequest(BaseModel):
         default_factory=lambda: get_config_default("cache_size"),
         examples=[4096],
     )
+    tensor_parallel: Optional[bool] = Field(
+        default_factory=lambda: get_config_default("tensor_parallel", False)
+    )
     gpu_split_auto: Optional[bool] = Field(
         default_factory=lambda: get_config_default("gpu_split_auto", True)
     )
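For context, here is a sketch of how a client request might exercise the new tensor_parallel field on ModelLoadRequest once this commit lands. The surrounding payload shape (the "name" field and the manual gpu_split values) is an assumption for illustration and is not defined by this diff:

import json

# Hypothetical model-load request body; the model name is a placeholder.
payload = {
    "name": "MyModel-exl2",
    "tensor_parallel": True,  # new field added by this commit
    "gpu_split": [20.6, 24],  # optional; omit it and the TP loader falls back to autosplit
}

print(json.dumps(payload, indent=2))

Per the config comments above, enabling tensor_parallel means the gpu_split_auto and autosplit_reserve values are ignored.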