Model: Add Tensor Parallel support

Use the tensor parallel loader when the flag is enabled. The TP loader
has its own autosplit implementation, so gpu_split_auto is ignored
while tensor parallelism is enabled.

Also simplify cache type selection: resolve the cache class once and
construct it with shared arguments instead of repeating the constructor
call across multiple if/else-style branches.
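
In outline, the loader selection ends up working like the sketch below
(an illustrative standalone function, not code from this change; the
returned names are the three loader calls that appear in the diff):

    def pick_loader(gpu_count, tensor_parallel, tp_available, gpu_split, gpu_split_auto):
        """Roughly how the container picks a loader after this change (sketch only)."""
        if gpu_count > 1 and tensor_parallel and tp_available:
            # TP loader: uses gpu_split when provided, otherwise its own autosplit
            return "load_tp_gen"
        if gpu_count > 1 and gpu_split_auto and not gpu_split:
            # Regular autosplit path, honoring autosplit_reserve
            return "load_autosplit_gen"
        # Manual split, or a plain single-GPU load
        return "load_gen"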

Signed-off-by: kingbri <bdashore3@proton.me>
Author:    kingbri
Date:      2024-08-16 16:35:19 -04:00
Committer: Brian Dashore
Parent:    5002617eac
Commit:    871c89063d

4 changed files with 109 additions and 53 deletions


@@ -50,6 +50,14 @@ from common.templating import (
 from common.transformers_utils import GenerationConfig, HuggingFaceConfig
 from common.utils import coalesce, unwrap
 
+# Dynamic imports
+try:
+    from exllamav2 import ExLlamaV2Cache_TP
+
+    has_tp = True
+except ImportError:
+    has_tp = False
+
 
 class ExllamaV2Container:
     """The model container class for ExLlamaV2 models."""
@@ -78,6 +86,7 @@ class ExllamaV2Container:
     gpu_split: Optional[list] = None
     gpu_split_auto: bool = True
     autosplit_reserve: List[float] = [96 * 1024**2]
+    use_tp: bool = False
 
     # Load state
     model_is_loading: bool = False
@@ -144,30 +153,52 @@ class ExllamaV2Container:
         # Turn off GPU split if the user is using 1 GPU
         gpu_count = torch.cuda.device_count()
         gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+        use_tp = unwrap(kwargs.get("tensor_parallel"), False)
+        gpu_split = kwargs.get("gpu_split")
         gpu_device_list = list(range(0, gpu_count))
 
-        if gpu_count > 1 and gpu_split_auto:
-            # Auto GPU split parameters
-            self.gpu_split_auto = gpu_split_auto
-
-            autosplit_reserve_megabytes = unwrap(kwargs.get("autosplit_reserve"), [96])
-            self.autosplit_reserve = [
-                int(math.ceil(value * 1024**2)) for value in autosplit_reserve_megabytes
-            ]
-        elif gpu_count > 1:
-            # Manual GPU split
-            self.gpu_split = kwargs.get("gpu_split")
-            self.gpu_split_auto = False
-
-            gpu_device_list = [
-                device_idx
-                for device_idx, memory in enumerate(self.gpu_split)
-                if memory > 0
-            ]
-        else:
-            # One GPU setup
+        # Set GPU split options
+        if gpu_count == 1:
             self.gpu_split_auto = False
             logger.info("Disabling GPU split because one GPU is in use.")
+        else:
+            # Set tensor parallel
+            if use_tp:
+                if has_tp:
+                    self.use_tp = True
+
+                    # TP has its own autosplit loader
+                    self.gpu_split_auto = False
+                else:
+                    # TODO: Remove conditional with exl2 v0.1.9 release
+                    logger.warning(
+                        "Tensor parallelism is not supported in the "
+                        "current ExllamaV2 version."
+                    )
+
+            # Enable manual GPU split if provided
+            if gpu_split:
+                self.gpu_split_auto = False
+                self.gpu_split = gpu_split
+
+                gpu_device_list = [
+                    device_idx
+                    for device_idx, memory in enumerate(self.gpu_split)
+                    if memory > 0
+                ]
+            elif gpu_split_auto and not self.use_tp:
+                # Otherwise fallback to autosplit settings
+                self.gpu_split_auto = gpu_split_auto
+
+                autosplit_reserve_megabytes = unwrap(
+                    kwargs.get("autosplit_reserve"), [96]
+                )
+
+                # Reserve VRAM for each GPU
+                self.autosplit_reserve = [
+                    int(math.ceil(value * 1024**2))
+                    for value in autosplit_reserve_megabytes
+                ]
 
         self.config = ExLlamaV2Config()
         self.config.model_dir = str(model_directory.resolve())
@@ -182,10 +213,7 @@ class ExllamaV2Container:
         self.config.prepare()
 
         # Check if the model arch is compatible with various exl2 features
-        try:
-            self.config.arch_compat_overrides()
-        except AttributeError:
-            pass
+        self.config.arch_compat_overrides()
 
         # Create the hf_config
         self.hf_config = HuggingFaceConfig.from_file(model_directory)
@@ -548,9 +576,11 @@ class ExllamaV2Container:
         if not self.quiet:
             logger.info("Loading draft model: " + self.draft_config.model_dir)
 
+        # Draft uses the autosplit loader, so create a cache that reflects this
         self.draft_cache = self.create_cache(
             cache_mode=self.draft_cache_mode,
             autosplit=True,
+            use_tp=False,
         )
 
         for value in self.draft_model.load_autosplit_gen(
@@ -572,7 +602,17 @@ class ExllamaV2Container:
 
         # Load model with manual split
        # Entrypoint for single GPU users
-        if not self.gpu_split_auto:
+        if self.use_tp:
+            logger.info("Loading with tensor parallel")
+
+            for value in self.model.load_tp_gen(
+                self.gpu_split,
+                callback_gen=progress_callback,
+                expect_cache_tokens=self.cache_size,
+            ):
+                if value:
+                    yield value
+        elif not self.gpu_split_auto:
             logger.info("Loading with a manual GPU split (or a one GPU setup)")
 
             for value in self.model.load_gen(
@@ -582,13 +622,15 @@ class ExllamaV2Container:
                 if value:
                     yield value
 
+        # Create the model cache
         self.cache = self.create_cache(
             cache_mode=self.cache_mode,
             autosplit=self.gpu_split_auto,
+            use_tp=self.use_tp,
         )
 
-        # Load model with autosplit
-        if self.gpu_split_auto:
+        # Load model with autosplit (without TP)
+        if self.gpu_split_auto and not self.use_tp:
             logger.info("Loading with autosplit")
 
             for value in self.model.load_autosplit_gen(
@@ -604,36 +646,35 @@ class ExllamaV2Container:
         input_ids = torch.zeros((1, self.config.max_input_len), dtype=torch.long)
         self.model.forward(input_ids, cache=self.cache, preprocess_only=True)
 
-    def create_cache(self, cache_mode: str, autosplit: bool):
+    def create_cache(self, cache_mode: str, autosplit: bool, use_tp: bool):
+        if has_tp and use_tp:
+            if self.cache_mode != "FP16":
+                logger.warning(
+                    "Tensor parallel does not currently allow for use of "
+                    "a quantized K/V cache. Using the specialized TP cache."
+                )
+
+            return ExLlamaV2Cache_TP(
+                self.model,
+                max_seq_len=self.cache_size,
+                batch_size=1,
+            )
+
+        cache_type = ExLlamaV2Cache
         match cache_mode:
             case "Q4":
-                return ExLlamaV2Cache_Q4(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=autosplit,
-                    batch_size=1,
-                )
+                cache_type = ExLlamaV2Cache_Q4
             case "Q6":
-                return ExLlamaV2Cache_Q6(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=self.gpu_split_auto,
-                    batch_size=1,
-                )
+                cache_type = ExLlamaV2Cache_Q6
             case "Q8":
-                return ExLlamaV2Cache_Q8(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=autosplit,
-                    batch_size=1,
-                )
-            case _:
-                return ExLlamaV2Cache(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=self.gpu_split_auto,
-                    batch_size=1,
-                )
+                cache_type = ExLlamaV2Cache_Q8
+
+        return cache_type(
+            self.model,
+            max_seq_len=self.cache_size,
+            lazy=autosplit,
+            batch_size=1,
+        )
 
     async def create_generator(self):
         try:
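
The quantized-cache branches above now differ only in which class they select,
so create_cache resolves the class first and constructs it once. The same idea
as a standalone sketch (illustrative only; a dict lookup works as well as
match, and the constructor arguments mirror the diff):

    from exllamav2 import (
        ExLlamaV2Cache,
        ExLlamaV2Cache_Q4,
        ExLlamaV2Cache_Q6,
        ExLlamaV2Cache_Q8,
    )

    # FP16 (ExLlamaV2Cache) is the fallback for any unrecognized mode
    CACHE_CLASSES = {
        "Q4": ExLlamaV2Cache_Q4,
        "Q6": ExLlamaV2Cache_Q6,
        "Q8": ExLlamaV2Cache_Q8,
    }

    def make_cache(model, cache_mode: str, cache_size: int, autosplit: bool):
        """Resolve the cache class once, then construct it with the shared arguments."""
        cache_class = CACHE_CLASSES.get(cache_mode, ExLlamaV2Cache)
        return cache_class(
            model,
            max_seq_len=cache_size,
            lazy=autosplit,
            batch_size=1,
        )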


@@ -107,6 +107,11 @@ def add_model_args(parser: argparse.ArgumentParser):
         type=str_to_bool,
         help="Overrides base model context length",
     )
+    model_group.add_argument(
+        "--tensor-parallel",
+        type=str_to_bool,
+        help="Use tensor parallelism to load models",
+    )
     model_group.add_argument(
         "--gpu-split-auto",
         type=str_to_bool,


@@ -109,6 +109,12 @@ model:
   # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral 7B)
   #override_base_seq_len:
 
+  # Load model with tensor parallelism
+  # If a GPU split isn't provided, the TP loader will fallback to autosplit
+  # Enabling ignores the gpu_split_auto and autosplit_reserve values
+  # NOTE: Requires a development build of exllamav2
+  #tensor_parallel: False
+
   # Automatically allocate resources to GPUs (default: True)
   # NOTE: Not parsed for single GPU users
   #gpu_split_auto: True
@@ -118,6 +124,7 @@ model:
   #autosplit_reserve: [96]
 
   # An integer array of GBs of vram to split between GPUs (default: [])
+  # Used with tensor parallelism
   # NOTE: Not parsed for single GPU users
   #gpu_split: [20.6, 24]
 
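
Taken together with the gpu_split entry above, enabling TP from the config
amounts to these model options reaching the container's kwargs (a sketch; the
key names match the config above, all other plumbing is omitted):

    # Keys read from the container's kwargs via kwargs.get(...) / unwrap(...)
    model_kwargs = {
        "tensor_parallel": True,   # use the TP loader (needs a TP-capable exllamav2 build)
        "gpu_split": [20.6, 24],   # optional; without it the TP loader autosplits on its own
        # gpu_split_auto and autosplit_reserve are ignored while tensor_parallel is enabled
    }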


@@ -96,6 +96,9 @@ class ModelLoadRequest(BaseModel):
         default_factory=lambda: get_config_default("cache_size"),
         examples=[4096],
     )
+    tensor_parallel: Optional[bool] = Field(
+        default_factory=lambda: get_config_default("tensor_parallel", False)
+    )
     gpu_split_auto: Optional[bool] = Field(
         default_factory=lambda: get_config_default("gpu_split_auto", True)
     )
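
With the new field, tensor parallelism can also be requested per load through
the API. A hypothetical request body (only tensor_parallel and gpu_split_auto
appear in the snippet above; the name and gpu_split fields and the exact
endpoint are assumptions for illustration):

    # Hypothetical body for the model load endpoint
    load_request = {
        "name": "example-exl2-model",  # assumed field, not part of this commit
        "tensor_parallel": True,       # new optional field added by this commit
        "gpu_split": [20.6, 24],       # assumed field; a manual split is also honored by the TP loader
    }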