Model: Add Tensor Parallel support

Use the tensor parallel loader when the flag is enabled. The new loader
has its own autosplit implementation, so gpu_split_auto isn't valid
here.

Also make it easier to determine which cache type to use, rather than
relying on multiple if/else statements.

Signed-off-by: kingbri <bdashore3@proton.me>
kingbri 2024-08-16 16:35:19 -04:00 committed by Brian Dashore
parent 5002617eac
commit 871c89063d
4 changed files with 109 additions and 53 deletions


@@ -50,6 +50,14 @@ from common.templating import (
from common.transformers_utils import GenerationConfig, HuggingFaceConfig
from common.utils import coalesce, unwrap
# Dynamic imports
try:
from exllamav2 import ExLlamaV2Cache_TP
has_tp = True
except ImportError:
has_tp = False
class ExllamaV2Container:
"""The model container class for ExLlamaV2 models."""
@@ -78,6 +86,7 @@ class ExllamaV2Container:
gpu_split: Optional[list] = None
gpu_split_auto: bool = True
autosplit_reserve: List[float] = [96 * 1024**2]
use_tp: bool = False
# Load state
model_is_loading: bool = False
@@ -144,30 +153,52 @@
# Turn off GPU split if the user is using 1 GPU
gpu_count = torch.cuda.device_count()
gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
use_tp = unwrap(kwargs.get("tensor_parallel"), False)
gpu_split = kwargs.get("gpu_split")
gpu_device_list = list(range(0, gpu_count))
if gpu_count > 1 and gpu_split_auto:
# Auto GPU split parameters
self.gpu_split_auto = gpu_split_auto
autosplit_reserve_megabytes = unwrap(kwargs.get("autosplit_reserve"), [96])
self.autosplit_reserve = [
int(math.ceil(value * 1024**2)) for value in autosplit_reserve_megabytes
]
elif gpu_count > 1:
# Manual GPU split
self.gpu_split = kwargs.get("gpu_split")
self.gpu_split_auto = False
gpu_device_list = [
device_idx
for device_idx, memory in enumerate(self.gpu_split)
if memory > 0
]
else:
# One GPU setup
# Set GPU split options
if gpu_count == 1:
self.gpu_split_auto = False
logger.info("Disabling GPU split because one GPU is in use.")
else:
# Set tensor parallel
if use_tp:
if has_tp:
self.use_tp = True
# TP has its own autosplit loader
self.gpu_split_auto = False
else:
# TODO: Remove conditional with exl2 v0.1.9 release
logger.warning(
"Tensor parallelism is not supported in the "
"current ExllamaV2 version."
)
# Enable manual GPU split if provided
if gpu_split:
self.gpu_split_auto = False
self.gpu_split = gpu_split
gpu_device_list = [
device_idx
for device_idx, memory in enumerate(self.gpu_split)
if memory > 0
]
elif gpu_split_auto and not self.use_tp:
# Otherwise fall back to autosplit settings
self.gpu_split_auto = gpu_split_auto
autosplit_reserve_megabytes = unwrap(
kwargs.get("autosplit_reserve"), [96]
)
# Reserve VRAM for each GPU
self.autosplit_reserve = [
int(math.ceil(value * 1024**2))
for value in autosplit_reserve_megabytes
]
self.config = ExLlamaV2Config()
self.config.model_dir = str(model_directory.resolve())
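
Taken together, the branch above gives tensor parallelism priority over autosplit, lets a manual gpu_split override autosplit, and disables splitting entirely on a single GPU. A minimal standalone sketch of that precedence (resolve_split_options is a hypothetical helper written for illustration, not code from this commit):

```python
# Illustrative only: distills the option precedence implemented above.
# resolve_split_options is a hypothetical helper, not part of this commit.
def resolve_split_options(gpu_count, use_tp, has_tp, gpu_split, gpu_split_auto):
    """Return (use_tp, gpu_split_auto, gpu_split) after applying precedence."""
    if gpu_count == 1:
        # One GPU: no split of any kind
        return False, False, None
    if use_tp and has_tp:
        # TP brings its own (auto)split logic; a manual split is still honored
        return True, False, gpu_split
    if gpu_split:
        # Manual split disables autosplit
        return False, False, gpu_split
    # Otherwise fall back to autosplit settings
    return False, gpu_split_auto, None

print(resolve_split_options(2, True, True, None, True))          # (True, False, None)
print(resolve_split_options(2, False, False, [20.6, 24], True))  # (False, False, [20.6, 24])
```
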
@@ -182,10 +213,7 @@ class ExllamaV2Container:
self.config.prepare()
# Check if the model arch is compatible with various exl2 features
try:
self.config.arch_compat_overrides()
except AttributeError:
pass
self.config.arch_compat_overrides()
# Create the hf_config
self.hf_config = HuggingFaceConfig.from_file(model_directory)
@@ -548,9 +576,11 @@ class ExllamaV2Container:
if not self.quiet:
logger.info("Loading draft model: " + self.draft_config.model_dir)
# Draft uses the autosplit loader, so create a cache that reflects this
self.draft_cache = self.create_cache(
cache_mode=self.draft_cache_mode,
autosplit=True,
use_tp=False,
)
for value in self.draft_model.load_autosplit_gen(
@@ -572,7 +602,17 @@
# Load model with manual split
# Entrypoint for single GPU users
if not self.gpu_split_auto:
if self.use_tp:
logger.info("Loading with tensor parallel")
for value in self.model.load_tp_gen(
self.gpu_split,
callback_gen=progress_callback,
expect_cache_tokens=self.cache_size,
):
if value:
yield value
elif not self.gpu_split_auto:
logger.info("Loading with a manual GPU split (or a one GPU setup)")
for value in self.model.load_gen(
@@ -582,13 +622,15 @@
if value:
yield value
# Create the model cache
self.cache = self.create_cache(
cache_mode=self.cache_mode,
autosplit=self.gpu_split_auto,
use_tp=self.use_tp,
)
# Load model with autosplit
if self.gpu_split_auto:
# Load model with autosplit (without TP)
if self.gpu_split_auto and not self.use_tp:
logger.info("Loading with autosplit")
for value in self.model.load_autosplit_gen(
@@ -604,36 +646,35 @@
input_ids = torch.zeros((1, self.config.max_input_len), dtype=torch.long)
self.model.forward(input_ids, cache=self.cache, preprocess_only=True)
def create_cache(self, cache_mode: str, autosplit: bool):
def create_cache(self, cache_mode: str, autosplit: bool, use_tp: bool):
if has_tp and use_tp:
if self.cache_mode != "FP16":
logger.warning(
"Tensor parallel does not currently allow for use of "
"a quantized K/V cache. Using the specialized TP cache."
)
return ExLlamaV2Cache_TP(
self.model,
max_seq_len=self.cache_size,
batch_size=1,
)
cache_type = ExLlamaV2Cache
match cache_mode:
case "Q4":
return ExLlamaV2Cache_Q4(
self.model,
max_seq_len=self.cache_size,
lazy=autosplit,
batch_size=1,
)
cache_type = ExLlamaV2Cache_Q4
case "Q6":
return ExLlamaV2Cache_Q6(
self.model,
max_seq_len=self.cache_size,
lazy=self.gpu_split_auto,
batch_size=1,
)
cache_type = ExLlamaV2Cache_Q6
case "Q8":
return ExLlamaV2Cache_Q8(
self.model,
max_seq_len=self.cache_size,
lazy=autosplit,
batch_size=1,
)
case _:
return ExLlamaV2Cache(
self.model,
max_seq_len=self.cache_size,
lazy=self.gpu_split_auto,
batch_size=1,
)
cache_type = ExLlamaV2Cache_Q8
return cache_type(
self.model,
max_seq_len=self.cache_size,
lazy=autosplit,
batch_size=1,
)
async def create_generator(self):
try:
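
A rough usage sketch of the reworked create_cache signature, mirroring the two call sites in this diff (container stands in for a loaded ExllamaV2Container; this is illustrative, not code from the commit). With use_tp enabled and ExLlamaV2Cache_TP available, the requested quantized cache_mode is ignored and the TP cache is returned instead.

```python
# Illustrative only; "container" stands in for a loaded ExllamaV2Container.

# Draft model: always loaded with autosplit and never with TP.
draft_cache = container.create_cache(
    cache_mode=container.draft_cache_mode,
    autosplit=True,
    use_tp=False,
)

# Main model: follows the container's split/TP settings. If use_tp is True and
# the installed exllamav2 exposes ExLlamaV2Cache_TP, this returns the TP cache
# (FP16 only) regardless of cache_mode; otherwise the match statement picks
# ExLlamaV2Cache, _Q4, _Q6, or _Q8.
cache = container.create_cache(
    cache_mode=container.cache_mode,
    autosplit=container.gpu_split_auto,
    use_tp=container.use_tp,
)
```
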


@@ -107,6 +107,11 @@ def add_model_args(parser: argparse.ArgumentParser):
type=str_to_bool,
help="Overrides base model context length",
)
model_group.add_argument(
"--tensor-parallel",
type=str_to_bool,
help="Use tensor parallelism to load models",
)
model_group.add_argument(
"--gpu-split-auto",
type=str_to_bool,
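
For reference, the new argument parses like any other str_to_bool flag. A self-contained sketch follows; the str_to_bool stand-in below only approximates the repo's helper and is an assumption:

```python
import argparse

def str_to_bool(value: str) -> bool:
    # Minimal stand-in for the repo's str_to_bool helper (assumption).
    return str(value).strip().lower() in ("1", "true", "yes", "y", "on")

parser = argparse.ArgumentParser()
model_group = parser.add_argument_group("Model arguments")
model_group.add_argument(
    "--tensor-parallel",
    type=str_to_bool,
    help="Use tensor parallelism to load models",
)

args = parser.parse_args(["--tensor-parallel", "true"])
print(args.tensor_parallel)  # True
```
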


@@ -109,6 +109,12 @@ model:
# Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral 7B)
#override_base_seq_len:
# Load model with tensor parallelism
# If a GPU split isn't provided, the TP loader will fall back to autosplit
# Enabling ignores the gpu_split_auto and autosplit_reserve values
# NOTE: Requires a development build of exllamav2
#tensor_parallel: False
# Automatically allocate resources to GPUs (default: True)
# NOTE: Not parsed for single GPU users
#gpu_split_auto: True
@@ -118,6 +124,7 @@ model:
#autosplit_reserve: [96]
# An integer array of GBs of vram to split between GPUs (default: [])
# Used with tensor parallelism
# NOTE: Not parsed for single GPU users
#gpu_split: [20.6, 24]
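
To show how these sample values are consumed, here is a small sketch that parses an uncommented snippet of this config and resolves the new option the same way the container's kwargs handling does. PyYAML and the local unwrap stand-in are assumptions (common.utils.unwrap returns the value unless it is None):

```python
# Illustrative only: resolves the new option from a config snippet the same
# way the container's kwargs handling above does.
import yaml  # assumes PyYAML is installed

sample = """
model:
  tensor_parallel: true
  gpu_split_auto: true      # ignored once tensor_parallel is enabled
  gpu_split: [20.6, 24]
"""

def unwrap(value, default):
    # Minimal stand-in for common.utils.unwrap (assumption).
    return default if value is None else value

kwargs = yaml.safe_load(sample)["model"]
use_tp = unwrap(kwargs.get("tensor_parallel"), False)
gpu_split = kwargs.get("gpu_split")

print(use_tp, gpu_split)  # True [20.6, 24]
```
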


@@ -96,6 +96,9 @@ class ModelLoadRequest(BaseModel):
default_factory=lambda: get_config_default("cache_size"),
examples=[4096],
)
tensor_parallel: Optional[bool] = Field(
default_factory=lambda: get_config_default("tensor_parallel", False)
)
gpu_split_auto: Optional[bool] = Field(
default_factory=lambda: get_config_default("gpu_split_auto", True)
)
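
Finally, a client-side sketch of a load request that opts into tensor parallelism. Only cache_size, tensor_parallel, and gpu_split_auto appear in this diff; the name and gpu_split fields and the endpoint path are assumptions and may differ from the actual API.

```python
# Illustrative request body only; fields other than cache_size,
# tensor_parallel, and gpu_split_auto are assumptions, as is the endpoint.
import json

load_request = {
    "name": "my-model",          # assumed field, not shown in this diff
    "cache_size": 4096,
    "tensor_parallel": True,     # new field added by this commit
    "gpu_split_auto": False,
    "gpu_split": [20.6, 24],     # assumed field, not shown in this diff
}

# e.g. POST this JSON to the server's model load endpoint (path assumed)
print(json.dumps(load_request, indent=2))
```
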