Model: Add Tensor Parallel support
Use the tensor parallel loader when the flag is enabled. The new loader has its own autosplit implementation, so gpu_split_auto does not apply when tensor parallelism is in use. Also simplify cache creation: select the cache class once and construct it in a single place instead of repeating the constructor across multiple if/else branches.

Signed-off-by: kingbri <bdashore3@proton.me>
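
For orientation, here is a minimal sketch of how the new option is meant to be enabled, using only the keys this commit adds to the sample config (see the `model:` hunks below); the split values are illustrative, not prescribed by the commit:

model:
  # Load the model with the tensor parallel loader
  # NOTE: Requires a development build of exllamav2
  tensor_parallel: True

  # Optional manual per-GPU split in GB. If omitted, the TP loader falls back
  # to its own autosplit; gpu_split_auto and autosplit_reserve are ignored.
  gpu_split: [20.6, 24]

The same flag is exposed as --tensor-parallel on the command line and as tensor_parallel on ModelLoadRequest, as the diffs below show.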
parent 5002617eac
commit 871c89063d
4 changed files with 109 additions and 53 deletions

@@ -50,6 +50,14 @@ from common.templating import (
 from common.transformers_utils import GenerationConfig, HuggingFaceConfig
 from common.utils import coalesce, unwrap
 
+# Dynamic imports
+try:
+    from exllamav2 import ExLlamaV2Cache_TP
+
+    has_tp = True
+except ImportError:
+    has_tp = False
+
 
 class ExllamaV2Container:
     """The model container class for ExLlamaV2 models."""
@@ -78,6 +86,7 @@ class ExllamaV2Container:
     gpu_split: Optional[list] = None
     gpu_split_auto: bool = True
     autosplit_reserve: List[float] = [96 * 1024**2]
+    use_tp: bool = False
 
     # Load state
     model_is_loading: bool = False
@@ -144,30 +153,52 @@ class ExllamaV2Container:
         # Turn off GPU split if the user is using 1 GPU
         gpu_count = torch.cuda.device_count()
         gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+        use_tp = unwrap(kwargs.get("tensor_parallel"), False)
+        gpu_split = kwargs.get("gpu_split")
         gpu_device_list = list(range(0, gpu_count))
 
-        if gpu_count > 1 and gpu_split_auto:
-            # Auto GPU split parameters
-            self.gpu_split_auto = gpu_split_auto
-
-            autosplit_reserve_megabytes = unwrap(kwargs.get("autosplit_reserve"), [96])
-            self.autosplit_reserve = [
-                int(math.ceil(value * 1024**2)) for value in autosplit_reserve_megabytes
-            ]
-        elif gpu_count > 1:
-            # Manual GPU split
-            self.gpu_split = kwargs.get("gpu_split")
-            self.gpu_split_auto = False
-
-            gpu_device_list = [
-                device_idx
-                for device_idx, memory in enumerate(self.gpu_split)
-                if memory > 0
-            ]
-        else:
-            # One GPU setup
+        # Set GPU split options
+        if gpu_count == 1:
             self.gpu_split_auto = False
             logger.info("Disabling GPU split because one GPU is in use.")
+        else:
+            # Set tensor parallel
+            if use_tp:
+                if has_tp:
+                    self.use_tp = True
+
+                    # TP has its own autosplit loader
+                    self.gpu_split_auto = False
+                else:
+                    # TODO: Remove conditional with exl2 v0.1.9 release
+                    logger.warning(
+                        "Tensor parallelism is not supported in the "
+                        "current ExllamaV2 version."
+                    )
+
+            # Enable manual GPU split if provided
+            if gpu_split:
+                self.gpu_split_auto = False
+                self.gpu_split = gpu_split
+
+                gpu_device_list = [
+                    device_idx
+                    for device_idx, memory in enumerate(self.gpu_split)
+                    if memory > 0
+                ]
+            elif gpu_split_auto and not self.use_tp:
+                # Otherwise fallback to autosplit settings
+                self.gpu_split_auto = gpu_split_auto
+
+                autosplit_reserve_megabytes = unwrap(
+                    kwargs.get("autosplit_reserve"), [96]
+                )
+
+                # Reserve VRAM for each GPU
+                self.autosplit_reserve = [
+                    int(math.ceil(value * 1024**2))
+                    for value in autosplit_reserve_megabytes
+                ]
 
         self.config = ExLlamaV2Config()
         self.config.model_dir = str(model_directory.resolve())
@@ -182,10 +213,7 @@ class ExllamaV2Container:
         self.config.prepare()
 
         # Check if the model arch is compatible with various exl2 features
-        try:
-            self.config.arch_compat_overrides()
-        except AttributeError:
-            pass
+        self.config.arch_compat_overrides()
 
         # Create the hf_config
         self.hf_config = HuggingFaceConfig.from_file(model_directory)
@@ -548,9 +576,11 @@ class ExllamaV2Container:
             if not self.quiet:
                 logger.info("Loading draft model: " + self.draft_config.model_dir)
 
+            # Draft uses the autosplit loader, so create a cache that reflects this
             self.draft_cache = self.create_cache(
                 cache_mode=self.draft_cache_mode,
                 autosplit=True,
+                use_tp=False,
             )
 
             for value in self.draft_model.load_autosplit_gen(
@@ -572,7 +602,17 @@ class ExllamaV2Container:
 
         # Load model with manual split
         # Entrypoint for single GPU users
-        if not self.gpu_split_auto:
+        if self.use_tp:
+            logger.info("Loading with tensor parallel")
+
+            for value in self.model.load_tp_gen(
+                self.gpu_split,
+                callback_gen=progress_callback,
+                expect_cache_tokens=self.cache_size,
+            ):
+                if value:
+                    yield value
+        elif not self.gpu_split_auto:
             logger.info("Loading with a manual GPU split (or a one GPU setup)")
 
             for value in self.model.load_gen(
@@ -582,13 +622,15 @@ class ExllamaV2Container:
                 if value:
                     yield value
 
+        # Create the model cache
         self.cache = self.create_cache(
             cache_mode=self.cache_mode,
             autosplit=self.gpu_split_auto,
+            use_tp=self.use_tp,
         )
 
-        # Load model with autosplit
-        if self.gpu_split_auto:
+        # Load model with autosplit (without TP)
+        if self.gpu_split_auto and not self.use_tp:
             logger.info("Loading with autosplit")
 
             for value in self.model.load_autosplit_gen(
@@ -604,36 +646,35 @@ class ExllamaV2Container:
         input_ids = torch.zeros((1, self.config.max_input_len), dtype=torch.long)
         self.model.forward(input_ids, cache=self.cache, preprocess_only=True)
 
-    def create_cache(self, cache_mode: str, autosplit: bool):
+    def create_cache(self, cache_mode: str, autosplit: bool, use_tp: bool):
+        if has_tp and use_tp:
+            if self.cache_mode != "FP16":
+                logger.warning(
+                    "Tensor parallel does not currently allow for use of "
+                    "a quantized K/V cache. Using the specialized TP cache."
+                )
+
+            return ExLlamaV2Cache_TP(
+                self.model,
+                max_seq_len=self.cache_size,
+                batch_size=1,
+            )
+
+        cache_type = ExLlamaV2Cache
         match cache_mode:
             case "Q4":
-                return ExLlamaV2Cache_Q4(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=autosplit,
-                    batch_size=1,
-                )
+                cache_type = ExLlamaV2Cache_Q4
             case "Q6":
-                return ExLlamaV2Cache_Q6(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=self.gpu_split_auto,
-                    batch_size=1,
-                )
+                cache_type = ExLlamaV2Cache_Q6
             case "Q8":
-                return ExLlamaV2Cache_Q8(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=autosplit,
-                    batch_size=1,
-                )
-            case _:
-                return ExLlamaV2Cache(
-                    self.model,
-                    max_seq_len=self.cache_size,
-                    lazy=self.gpu_split_auto,
-                    batch_size=1,
-                )
+                cache_type = ExLlamaV2Cache_Q8
+
+        return cache_type(
+            self.model,
+            max_seq_len=self.cache_size,
+            lazy=autosplit,
+            batch_size=1,
+        )
 
     async def create_generator(self):
         try:

@@ -107,6 +107,11 @@ def add_model_args(parser: argparse.ArgumentParser):
         type=str_to_bool,
         help="Overrides base model context length",
     )
+    model_group.add_argument(
+        "--tensor-parallel",
+        type=str_to_bool,
+        help="Use tensor parallelism to load models",
+    )
     model_group.add_argument(
         "--gpu-split-auto",
         type=str_to_bool,

@@ -109,6 +109,12 @@ model:
   # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral 7B)
   #override_base_seq_len:
 
+  # Load model with tensor parallelism
+  # If a GPU split isn't provided, the TP loader will fallback to autosplit
+  # Enabling ignores the gpu_split_auto and autosplit_reserve values
+  # NOTE: Requires a development build of exllamav2
+  #tensor_parallel: False
+
   # Automatically allocate resources to GPUs (default: True)
   # NOTE: Not parsed for single GPU users
   #gpu_split_auto: True
@@ -118,6 +124,7 @@ model:
   #autosplit_reserve: [96]
 
   # An integer array of GBs of vram to split between GPUs (default: [])
+  # Used with tensor parallelism
   # NOTE: Not parsed for single GPU users
   #gpu_split: [20.6, 24]
 

@@ -96,6 +96,9 @@ class ModelLoadRequest(BaseModel):
         default_factory=lambda: get_config_default("cache_size"),
         examples=[4096],
     )
+    tensor_parallel: Optional[bool] = Field(
+        default_factory=lambda: get_config_default("tensor_parallel", False)
+    )
     gpu_split_auto: Optional[bool] = Field(
         default_factory=lambda: get_config_default("gpu_split_auto", True)
     )
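
As a closing illustration, here is a hedged sketch of a model-load request body that uses the new field. Only tensor_parallel, gpu_split_auto, and cache_size are taken from the ModelLoadRequest fields shown above; the "name" key is a hypothetical placeholder for however the load endpoint identifies the model:

# Illustrative payload only; "name" is a placeholder, not defined in this diff
payload = {
    "name": "my-exl2-model",    # hypothetical model identifier
    "tensor_parallel": True,    # new field added by this commit
    "gpu_split_auto": False,    # not honored by the TP loader anyway
    "cache_size": 4096,         # matches the example value in the request model
}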