From 871c89063d9f773f128c205a50fedeb9fa69b193 Mon Sep 17 00:00:00 2001
From: kingbri
Date: Fri, 16 Aug 2024 16:35:19 -0400
Subject: [PATCH] Model: Add Tensor Parallel support

Use the tensor parallel loader when the flag is enabled. The new loader
has its own autosplit implementation, so gpu_split_auto isn't valid
here.

Also make it easier to determine which cache type to use, rather than
relying on multiple if/else statements.

Signed-off-by: kingbri
---
 backends/exllamav2/model.py   | 147 ++++++++++++++++++++++------------
 common/args.py                |   5 ++
 config_sample.yml             |   7 ++
 endpoints/core/types/model.py |   3 +
 4 files changed, 109 insertions(+), 53 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 0648dbc..d472117 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -50,6 +50,14 @@ from common.templating import (
 from common.transformers_utils import GenerationConfig, HuggingFaceConfig
 from common.utils import coalesce, unwrap
 
+# Dynamic imports
+try:
+    from exllamav2 import ExLlamaV2Cache_TP
+
+    has_tp = True
+except ImportError:
+    has_tp = False
+
 
 class ExllamaV2Container:
     """The model container class for ExLlamaV2 models."""
@@ -78,6 +86,7 @@ class ExllamaV2Container:
     gpu_split: Optional[list] = None
     gpu_split_auto: bool = True
     autosplit_reserve: List[float] = [96 * 1024**2]
+    use_tp: bool = False
 
     # Load state
     model_is_loading: bool = False
@@ -144,30 +153,52 @@ class ExllamaV2Container:
         # Turn off GPU split if the user is using 1 GPU
         gpu_count = torch.cuda.device_count()
         gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+        use_tp = unwrap(kwargs.get("tensor_parallel"), False)
+        gpu_split = kwargs.get("gpu_split")
         gpu_device_list = list(range(0, gpu_count))
 
-        if gpu_count > 1 and gpu_split_auto:
-            # Auto GPU split parameters
-            self.gpu_split_auto = gpu_split_auto
-
-            autosplit_reserve_megabytes = unwrap(kwargs.get("autosplit_reserve"), [96])
-            self.autosplit_reserve = [
-                int(math.ceil(value * 1024**2)) for value in autosplit_reserve_megabytes
-            ]
-        elif gpu_count > 1:
-            # Manual GPU split
-            self.gpu_split = kwargs.get("gpu_split")
-            self.gpu_split_auto = False
-
-            gpu_device_list = [
-                device_idx
-                for device_idx, memory in enumerate(self.gpu_split)
-                if memory > 0
-            ]
-        else:
-            # One GPU setup
+        # Set GPU split options
+        if gpu_count == 1:
             self.gpu_split_auto = False
             logger.info("Disabling GPU split because one GPU is in use.")
+        else:
+            # Set tensor parallel
+            if use_tp:
+                if has_tp:
+                    self.use_tp = True
+
+                    # TP has its own autosplit loader
+                    self.gpu_split_auto = False
+                else:
+                    # TODO: Remove conditional with exl2 v0.1.9 release
+                    logger.warning(
+                        "Tensor parallelism is not supported in the "
+                        "current ExllamaV2 version."
+                    )
+
+            # Enable manual GPU split if provided
+            if gpu_split:
+                self.gpu_split_auto = False
+                self.gpu_split = gpu_split
+
+                gpu_device_list = [
+                    device_idx
+                    for device_idx, memory in enumerate(self.gpu_split)
+                    if memory > 0
+                ]
+            elif gpu_split_auto and not self.use_tp:
+                # Otherwise fallback to autosplit settings
+                self.gpu_split_auto = gpu_split_auto
+
+                autosplit_reserve_megabytes = unwrap(
+                    kwargs.get("autosplit_reserve"), [96]
+                )
+
+                # Reserve VRAM for each GPU
+                self.autosplit_reserve = [
+                    int(math.ceil(value * 1024**2))
+                    for value in autosplit_reserve_megabytes
+                ]
 
         self.config = ExLlamaV2Config()
         self.config.model_dir = str(model_directory.resolve())
@@ -182,10 +213,7 @@ class ExllamaV2Container:
         self.config.prepare()
 
         # Check if the model arch is compatible with various exl2 features
-        try:
-            self.config.arch_compat_overrides()
-        except AttributeError:
-            pass
+        self.config.arch_compat_overrides()
 
         # Create the hf_config
         self.hf_config = HuggingFaceConfig.from_file(model_directory)
@@ -548,9 +576,11 @@ class ExllamaV2Container:
             if not self.quiet:
                 logger.info("Loading draft model: " + self.draft_config.model_dir)
 
+            # Draft uses the autosplit loader, so create a cache that reflects this
             self.draft_cache = self.create_cache(
                 cache_mode=self.draft_cache_mode,
                 autosplit=True,
+                use_tp=False,
             )
 
             for value in self.draft_model.load_autosplit_gen(
@@ -572,7 +602,17 @@ class ExllamaV2Container:
 
         # Load model with manual split
         # Entrypoint for single GPU users
-        if not self.gpu_split_auto:
+        if self.use_tp:
+            logger.info("Loading with tensor parallel")
+
+            for value in self.model.load_tp_gen(
+                self.gpu_split,
+                callback_gen=progress_callback,
+                expect_cache_tokens=self.cache_size,
+            ):
+                if value:
+                    yield value
+        elif not self.gpu_split_auto:
             logger.info("Loading with a manual GPU split (or a one GPU setup)")
 
             for value in self.model.load_gen(
@@ -582,13 +622,15 @@ class ExllamaV2Container:
                 if value:
                     yield value
 
+        # Create the model cache
         self.cache = self.create_cache(
             cache_mode=self.cache_mode,
            autosplit=self.gpu_split_auto,
+            use_tp=self.use_tp,
         )
 
-        # Load model with autosplit
-        if self.gpu_split_auto:
+        # Load model with autosplit (without TP)
+        if self.gpu_split_auto and not self.use_tp:
             logger.info("Loading with autosplit")
 
             for value in self.model.load_autosplit_gen(
@@ -604,36 +646,35 @@ class ExllamaV2Container:
         input_ids = torch.zeros((1, self.config.max_input_len), dtype=torch.long)
         self.model.forward(input_ids, cache=self.cache, preprocess_only=True)
 
-    def create_cache(self, cache_mode: str, autosplit: bool):
+    def create_cache(self, cache_mode: str, autosplit: bool, use_tp: bool):
+        if has_tp and use_tp:
+            if self.cache_mode != "FP16":
+                logger.warning(
+                    "Tensor parallel does not currently allow for use of "
+                    "a quantized K/V cache. Using the specialized TP cache."
+                )
+
+            return ExLlamaV2Cache_TP(
+                self.model,
+                max_seq_len=self.cache_size,
+                batch_size=1,
+            )
+
+        cache_type = ExLlamaV2Cache
         match cache_mode:
             case "Q4":
-                return ExLlamaV2Cache_Q4(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=autosplit,
-                    batch_size=1,
-                )
+                cache_type = ExLlamaV2Cache_Q4
             case "Q6":
-                return ExLlamaV2Cache_Q6(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=self.gpu_split_auto,
-                    batch_size=1,
-                )
+                cache_type = ExLlamaV2Cache_Q6
             case "Q8":
-                return ExLlamaV2Cache_Q8(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=autosplit,
-                    batch_size=1,
-                )
-            case _:
-                return ExLlamaV2Cache(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=self.gpu_split_auto,
-                    batch_size=1,
-                )
+                cache_type = ExLlamaV2Cache_Q8
+
+        return cache_type(
+            self.model,
+            max_seq_len=self.cache_size,
+            lazy=autosplit,
+            batch_size=1,
+        )
 
     async def create_generator(self):
         try:
diff --git a/common/args.py b/common/args.py
index a0f19c2..b879ec3 100644
--- a/common/args.py
+++ b/common/args.py
@@ -107,6 +107,11 @@ def add_model_args(parser: argparse.ArgumentParser):
         type=str_to_bool,
         help="Overrides base model context length",
     )
+    model_group.add_argument(
+        "--tensor-parallel",
+        type=str_to_bool,
+        help="Use tensor parallelism to load models",
+    )
     model_group.add_argument(
         "--gpu-split-auto",
         type=str_to_bool,
diff --git a/config_sample.yml b/config_sample.yml
index 018ff61..98f4fb7 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -109,6 +109,12 @@ model:
   # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral 7B)
   #override_base_seq_len:
 
+  # Load model with tensor parallelism
+  # If a GPU split isn't provided, the TP loader will fall back to autosplit
+  # Enabling this ignores the gpu_split_auto and autosplit_reserve values
+  # NOTE: Requires a development build of exllamav2
+  #tensor_parallel: False
+
   # Automatically allocate resources to GPUs (default: True)
   # NOTE: Not parsed for single GPU users
   #gpu_split_auto: True
@@ -118,6 +124,7 @@ model:
   #autosplit_reserve: [96]
 
   # An integer array of GBs of vram to split between GPUs (default: [])
+  # Used with tensor parallelism
   # NOTE: Not parsed for single GPU users
   #gpu_split: [20.6, 24]
 
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py
index 1e2eb46..c4959fe 100644
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -96,6 +96,9 @@ class ModelLoadRequest(BaseModel):
         default_factory=lambda: get_config_default("cache_size"),
         examples=[4096],
     )
+    tensor_parallel: Optional[bool] = Field(
+        default_factory=lambda: get_config_default("tensor_parallel", False)
+    )
     gpu_split_auto: Optional[bool] = Field(
         default_factory=lambda: get_config_default("gpu_split_auto", True)
     )
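
For reference, the create_cache() consolidation above boils down to choosing one cache class and calling a single constructor, with the tensor-parallel cache handled as a special case. The sketch below restates that pattern as a standalone function: make_cache() and its parameters are invented for illustration (the real method lives on ExllamaV2Container), a dict lookup stands in for the patch's match statement, and ExLlamaV2Cache_TP is assumed to be available only in newer exllamav2 builds, hence the guarded import.

    # Illustrative sketch only: make_cache() is not part of TabbyAPI or exllamav2.
    from exllamav2 import (
        ExLlamaV2Cache,
        ExLlamaV2Cache_Q4,
        ExLlamaV2Cache_Q6,
        ExLlamaV2Cache_Q8,
    )

    # ExLlamaV2Cache_TP only ships in newer exllamav2 builds, so guard the import
    try:
        from exllamav2 import ExLlamaV2Cache_TP

        has_tp = True
    except ImportError:
        has_tp = False


    def make_cache(model, cache_mode: str, cache_size: int, autosplit: bool, use_tp: bool):
        """Pick one cache class up front instead of duplicating the constructor call."""

        if has_tp and use_tp:
            # The TP loader manages its own device split, so the dedicated TP cache
            # is constructed without the lazy/autosplit flag (as in the patch above)
            return ExLlamaV2Cache_TP(model, max_seq_len=cache_size, batch_size=1)

        # Map quantized cache modes to their classes, defaulting to the FP16 cache
        cache_type = {
            "Q4": ExLlamaV2Cache_Q4,
            "Q6": ExLlamaV2Cache_Q6,
            "Q8": ExLlamaV2Cache_Q8,
        }.get(cache_mode, ExLlamaV2Cache)

        # All non-TP cache classes share the same constructor signature
        return cache_type(model, max_seq_len=cache_size, lazy=autosplit, batch_size=1)

The same reasoning drives the loader changes: because the TP loader brings its own autosplit implementation, enabling tensor_parallel forces gpu_split_auto off, and a manual gpu_split, when provided, is passed straight to load_tp_gen().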