Merge branch 'exl3' into backend-detect
commit 02a8d68e17

4 changed files with 68 additions and 8 deletions
@@ -187,6 +187,15 @@ class ExllamaV2Container(BaseModelContainer):
         # Get cache mode
         self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")
 
+        # Catch exllamav3 cache_mode
+        if not self.cache_mode.startswith("Q"):
+            logger.warning(
+                f"Provided cache mode '{self.cache_mode}' is not a "
+                "valid choice for exllamav2, please check your settings. "
+                "Defaulting to FP16."
+            )
+            self.cache_mode = "FP16"
+
         # Turn off GPU split if the user is using 1 GPU
         gpu_count = torch.cuda.device_count()
         gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
@@ -392,6 +401,15 @@ class ExllamaV2Container(BaseModelContainer):
         # Set draft cache mode
         self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")
 
+        # Catch exllamav3 draft_cache_mode
+        if not self.draft_cache_mode.startswith("Q"):
+            logger.warning(
+                f"Provided draft cache mode '{self.draft_cache_mode}' is not a "
+                "valid choice for exllamav2, please check your settings. "
+                "Defaulting to FP16."
+            )
+            self.draft_cache_mode = "FP16"
+
         # Edit the draft config size
         if chunk_size:
             self.draft_config.max_input_len = chunk_size
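
Outside the diff, a minimal sketch of the fallback these two hunks introduce (the helper name is hypothetical, not from the repo): exllamav2 only understands its Q* quantized modes, so an exllamav3-style "k,v" pair such as "8,8" is warned about and reset to FP16. A plain "FP16" value also takes the branch, but it is simply reset to itself, so the only effect there is the warning.

from loguru import logger

def coerce_exl2_cache_mode(cache_mode: str) -> str:
    # Anything that is not a Q* mode (e.g. an exllamav3-style "8,8" pair)
    # falls back to FP16 with a warning, mirroring the guard above.
    if not cache_mode.startswith("Q"):
        logger.warning(
            f"Provided cache mode '{cache_mode}' is not a valid choice "
            "for exllamav2, please check your settings. Defaulting to FP16."
        )
        cache_mode = "FP16"
    return cache_mode

# coerce_exl2_cache_mode("Q8")  -> "Q8"
# coerce_exl2_cache_mode("8,8") -> "FP16", after a warning
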
@@ -1,6 +1,7 @@
 import asyncio
 import gc
 import pathlib
+import re
 import traceback
 from typing import (
     Any,
@@ -19,6 +20,7 @@ from exllamav3 import (
     Model,
     Tokenizer,
 )
+from exllamav3.cache import CacheLayer_quant
 from loguru import logger
 
 from backends.base_model_container import BaseModelContainer
@@ -73,6 +75,8 @@ class ExllamaV3Container(BaseModelContainer):
     use_tp: bool = False
     max_seq_len: int = 4096
     cache_size: int = 4096
+    cache_mode: str = "FP16"
+    draft_cache_mode: str = "FP16"
     chunk_size: int = 2048
     max_batch_size: Optional[int] = None
 
@@ -164,7 +168,7 @@ class ExllamaV3Container(BaseModelContainer):
             logger.info(f"Using draft model: {str(draft_model_path.resolve())}")
         else:
             self.draft_model = None
-            self.craft_cache = None
+            self.draft_cache = None
 
         # Turn off GPU split if the user is using 1 GPU
         gpu_count = torch.cuda.device_count()
@@ -217,11 +221,16 @@ class ExllamaV3Container(BaseModelContainer):
         # Cache
         user_cache_size = unwrap(kwargs.get("cache_size"), self.max_seq_len)
         self.cache_size = self.adjust_cache_size(user_cache_size)
-        self.cache = Cache(self.model, max_num_tokens=self.cache_size)
+        self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")
+        self.cache = self.create_cache(self.cache_mode, self.model)
 
         # Draft cache
         if self.use_draft_model:
-            self.draft_cache = Cache(self.draft_model, max_num_tokens=self.cache_size)
+            # Set draft cache mode
+            self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")
+            self.draft_cache = self.create_cache(
+                self.draft_cache_mode, self.draft_model
+            )
 
         # Max batch size
         self.max_batch_size = unwrap(kwargs.get("max_batch_size"), 256)
@@ -300,6 +309,33 @@ class ExllamaV3Container(BaseModelContainer):
 
         return chunk_size
 
+    def create_cache(self, raw_cache_mode: str, model: Model):
+        # Cast exl2 types to exl3
+        match raw_cache_mode:
+            case "Q4":
+                raw_cache_mode = "4,4"
+            case "Q6":
+                raw_cache_mode = "6,6"
+            case "Q8":
+                raw_cache_mode = "8,8"
+
+        split_cache_mode = re.search(r"^([2-8])\s*,\s*([2-8])$", raw_cache_mode)
+
+        if split_cache_mode:
+            draft_k_bits = int(split_cache_mode.group(1))
+            draft_v_bits = int(split_cache_mode.group(2))
+            cache = Cache(
+                model,
+                max_num_tokens=self.cache_size,
+                layer_type=CacheLayer_quant,
+                k_bits=draft_k_bits,
+                v_bits=draft_v_bits,
+            )
+        else:
+            cache = Cache(model, max_num_tokens=self.cache_size)
+
+        return cache
+
     def model_info(self) -> ModelCard:
         """
         Returns a dictionary of the current model's configuration parameters.
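
As a standalone illustration of the casting that create_cache performs (the function name below is hypothetical, not part of the commit): exllamav2-style aliases are first mapped to exllamav3 k,v bit pairs, the pair is then parsed with the same 2-8 regex, and anything that does not parse (such as "FP16") means an unquantized cache.

import re
from typing import Optional, Tuple

def parse_cache_mode(raw_cache_mode: str) -> Optional[Tuple[int, int]]:
    # Cast exl2 aliases to exl3-style k,v bit pairs
    aliases = {"Q4": "4,4", "Q6": "6,6", "Q8": "8,8"}
    raw_cache_mode = aliases.get(raw_cache_mode, raw_cache_mode)

    # Same pattern the method uses: two integers from 2-8, comma separated
    split_cache_mode = re.search(r"^([2-8])\s*,\s*([2-8])$", raw_cache_mode)
    if split_cache_mode:
        return int(split_cache_mode.group(1)), int(split_cache_mode.group(2))

    # No match (e.g. "FP16") -> no quantized layers, use a plain cache
    return None

print(parse_cache_mode("Q6"))    # (6, 6)
print(parse_cache_mode("3, 5"))  # (3, 5)
print(parse_cache_mode("FP16"))  # None
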
@@ -312,7 +348,7 @@ class ExllamaV3Container(BaseModelContainer):
             max_seq_len=self.max_seq_len,
             cache_size=self.cache_size,
             max_batch_size=self.max_batch_size,
-            # cache_mode=self.cache_mode,
+            cache_mode=self.cache_mode,
             chunk_size=self.chunk_size,
             use_vision=self.use_vision,
         )

@@ -1,6 +1,7 @@
 from pydantic import (
     BaseModel,
     ConfigDict,
+    constr,
     Field,
     PrivateAttr,
     field_validator,
@@ -9,6 +10,7 @@ from typing import List, Literal, Optional, Union
 
 
 CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"]
+CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8]\s*,\s*[2-8]$")]
 
 
 class Metadata(BaseModel):
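
A minimal validation sketch for the new CACHE_TYPE alias, assuming pydantic v2 (the CacheSettings model below is illustrative, not from the repo): the Union accepts either one of the exllamav2 literals or a "k,v" pair string matching the constr pattern.

from typing import Literal, Optional, Union

from pydantic import BaseModel, ValidationError, constr

CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"]
CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8]\s*,\s*[2-8]$")]

class CacheSettings(BaseModel):
    cache_mode: Optional[CACHE_TYPE] = "FP16"

print(CacheSettings(cache_mode="Q4").cache_mode)    # Q4 (exllamav2 literal)
print(CacheSettings(cache_mode="6, 8").cache_mode)  # 6, 8 (exllamav3 k,v pair)

try:
    CacheSettings(cache_mode="9,9")                 # bits outside 2-8
except ValidationError:
    print("9,9 rejected: k_bits and v_bits must be in the 2-8 range")
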
@@ -225,11 +227,13 @@ class ModelConfig(BaseConfigModel):
             "or auto-calculate."
         ),
     )
-    cache_mode: Optional[CACHE_SIZES] = Field(
+    cache_mode: Optional[CACHE_TYPE] = Field(
         "FP16",
         description=(
             "Enable different cache modes for VRAM savings (default: FP16).\n"
-            f"Possible values: {str(CACHE_SIZES)[15:-1]}."
+            f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n"
+            "For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits "
+            "are integers from 2-8 (i.e. 8,8)."
        ),
    )
    cache_size: Optional[int] = Field(

@@ -114,7 +114,8 @@ model:
   rope_alpha:
 
   # Enable different cache modes for VRAM savings (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8).
   cache_mode: FP16
 
   # Size of the prompt cache to allocate (default: max_seq_len).
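
For completeness, a small sketch (illustrative only, assuming pyyaml; this is not the project's config loading code) showing that a pair like 8,8 written in this YAML field arrives as a plain string, which is the form the backends' cache-mode parsing expects:

import re

import yaml  # pyyaml

snippet = """
model:
  cache_mode: 8,8
"""

cache_mode = yaml.safe_load(snippet)["model"]["cache_mode"]
print(repr(cache_mode))                                      # '8,8' (a string)
print(bool(re.fullmatch(r"[2-8]\s*,\s*[2-8]", cache_mode)))  # True: valid exl3 pair
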
@@ -164,7 +165,8 @@ draft_model:
   draft_rope_alpha:
 
   # Cache mode for draft models to save VRAM (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8).
   draft_cache_mode: FP16
 
   # An integer array of GBs of VRAM to split between GPUs (default: []).