API + Model: Add support for specifying k/v cache size

This commit is contained in:
parent d710a1b441
commit 767e6a798a

4 changed files with 51 additions and 5 deletions
@@ -58,6 +58,7 @@ class ExllamaV2Container:
     paged: bool = True

     # Internal config vars
+    cache_size: int = None
     cache_mode: str = "FP16"
     max_batch_size: int = 20
     generation_config: Optional[GenerationConfig] = None
@@ -91,9 +92,11 @@ class ExllamaV2Container:
             loading_draft: bool)
         **kwargs:
             `cache_mode` (str): Sets cache mode, "FP16" or "FP8"
-                (defaulf: "FP16")
+                (default: "FP16")
             'max_seq_len' (int): Override model's default max sequence
                 length (default: 4096)
+            'cache_size' (int): Num of tokens to allocate space for in the k/v cache
+                (default: max_seq_len)
             'rope_scale' (float): Set RoPE scaling factor for model
                 (default: 1.0)
             'rope_alpha' (float): Set RoPE alpha (NTK) factor for model
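For context, a hedged sketch of the **kwargs described above as they might be passed at load time; the option names come from the docstring, but every value below is made up for illustration:

# Illustrative load-time options only; not part of the diff.
model_kwargs = {
    "cache_mode": "FP16",   # or "FP8"
    "max_seq_len": 4096,
    "cache_size": 8192,     # falls back to max_seq_len when omitted
    "rope_scale": 1.0,
    "rope_alpha": 1.0,
}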
@@ -189,6 +192,16 @@ class ExllamaV2Container:
             kwargs.get("rope_alpha"), self.calculate_rope_alpha(base_seq_len)
         )

+        # Set k/v cache size
+        self.cache_size = unwrap(kwargs.get("cache_size"), self.config.max_seq_len)
+        if self.cache_size < self.config.max_seq_len:
+            logger.warning(
+                "Your specified cache_size is smaller than your "
+                "desired context length. \n"
+                "Defaulting cache_size to max_seq_len."
+            )
+            self.cache_size = self.config.max_seq_len
+
         # Enable fasttensors loading if present
         self.config.fasttensors = unwrap(kwargs.get("fasttensors"), False)

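A minimal standalone sketch of the fallback logic added above; resolve_cache_size is a hypothetical helper (not in the codebase), and unwrap from the diff is assumed to return its first non-None argument:

def resolve_cache_size(requested, max_seq_len):
    # Mirror of unwrap(kwargs.get("cache_size"), max_seq_len)
    cache_size = requested if requested is not None else max_seq_len
    if cache_size < max_seq_len:
        # Same guard as above: a cache smaller than the context window
        # cannot hold a single full-length prompt, so clamp it up.
        cache_size = max_seq_len
    return cache_size

assert resolve_cache_size(None, 4096) == 4096
assert resolve_cache_size(2048, 4096) == 4096
assert resolve_cache_size(8192, 4096) == 8192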
@@ -359,6 +372,7 @@ class ExllamaV2Container:
             "rope_scale": self.config.scale_pos_emb,
             "rope_alpha": self.config.scale_alpha_value,
             "max_seq_len": self.config.max_seq_len,
+            "cache_size": self.cache_size,
             "cache_mode": self.cache_mode,
             "chunk_size": self.config.max_input_len,
             "num_experts_per_token": self.config.num_experts_per_token,
@@ -484,7 +498,11 @@ class ExllamaV2Container:
         if not self.quiet:
             logger.info("Loading draft model: " + self.draft_config.model_dir)

-        self.draft_cache = ExLlamaV2Cache(self.draft_model, lazy=True)
+        self.draft_cache = ExLlamaV2Cache(
+            self.draft_model,
+            max_seq_len=self.cache_size,
+            lazy=True,
+        )
         for value in self.draft_model.load_autosplit_gen(
             self.draft_cache,
             reserve_vram=autosplit_reserve,
@@ -516,15 +534,24 @@ class ExllamaV2Container:

         if self.cache_mode == "Q4":
             self.cache = ExLlamaV2Cache_Q4(
-                self.model, lazy=self.gpu_split_auto, batch_size=1
+                self.model,
+                max_seq_len=self.cache_size,
+                lazy=self.gpu_split_auto,
+                batch_size=1,
             )
         elif self.cache_mode == "FP8":
             self.cache = ExLlamaV2Cache_8bit(
-                self.model, lazy=self.gpu_split_auto, batch_size=1
+                self.model,
+                max_seq_len=self.cache_size,
+                lazy=self.gpu_split_auto,
+                batch_size=1,
             )
         else:
             self.cache = ExLlamaV2Cache(
-                self.model, lazy=self.gpu_split_auto, batch_size=1
+                self.model,
+                max_seq_len=self.cache_size,
+                lazy=self.gpu_split_auto,
+                batch_size=1,
             )

         # Load model with autosplit
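To put cache_size and cache_mode in perspective, a rough back-of-the-envelope estimate of k/v cache VRAM; the model dimensions and byte widths below are assumptions for the example, not values taken from this commit:

def kv_cache_bytes(cache_size, num_layers, num_kv_heads, head_dim, bytes_per_elem):
    # Keys and values (factor 2), per layer, per kv head, per cached token.
    return int(2 * num_layers * num_kv_heads * head_dim * cache_size * bytes_per_elem)

# Assumed 7B-class shape: 32 layers, 8 kv heads, head_dim 128.
# FP16 uses ~2 bytes/element, FP8 ~1, Q4 roughly 0.5 plus quantization scales.
print(kv_cache_bytes(8192, 32, 8, 128, 2) / 2**30)  # ~1.0 GiB at FP16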
@@ -82,6 +82,11 @@ def add_model_args(parser: argparse.ArgumentParser):
         type=str_to_bool,
         help="Overrides base model context length",
     )
+    model_group.add_argument(
+        "--cache-size",
+        type=int,
+        help="The size of the prompt cache (in number of tokens) to allocate"
+    )
     model_group.add_argument(
         "--rope-scale", type=float, help="Sets rope_scale or compress_pos_emb"
     )
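A self-contained sketch of how the new flag parses; the real parser is assembled by add_model_args above, so the standalone parser here is only for illustration:

import argparse

parser = argparse.ArgumentParser()
model_group = parser.add_argument_group("model")
model_group.add_argument(
    "--cache-size",
    type=int,
    help="The size of the prompt cache (in number of tokens) to allocate",
)

args = parser.parse_args(["--cache-size", "8192"])
assert args.cache_size == 8192  # argparse maps the dash to an underscore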
@@ -77,6 +77,12 @@ model:
   # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral 7B)
   #override_base_seq_len:

+  # Size of the prompt cache to allocate (in number of tokens, must be a multiple of 256)
+  # Larger cache uses more VRAM, but allows for more prompts to be cached and a larger batch of gens to proceed simultanously
+  # The minimum size is max_seq_len, but we recommend setting this to the highest value that will fit on your GPU
+  # Recommend setting this to at least max_seq_len * 2 if you want to use CFG with full-length positive and negative prompts
+  #cache_size:
+
   # Automatically allocate resources to GPUs (default: True)
   # NOTE: Not parsed for single GPU users
   #gpu_split_auto: True
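A hypothetical helper, not part of the project, that picks a cache_size consistent with the comments above (at least max_seq_len, doubled for CFG, rounded up to a multiple of 256):

def suggest_cache_size(max_seq_len, use_cfg=False):
    target = max_seq_len * 2 if use_cfg else max_seq_len
    return -(-target // 256) * 256  # round up to the next multiple of 256

assert suggest_cache_size(4096) == 4096
assert suggest_cache_size(4096, use_cfg=True) == 8192
assert suggest_cache_size(4000) == 4096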
@@ -15,6 +15,7 @@ class ModelCardParameters(BaseModel):
     max_seq_len: Optional[int] = None
     rope_scale: Optional[float] = 1.0
     rope_alpha: Optional[float] = 1.0
+    cache_size: Optional[int] = None
     cache_mode: Optional[str] = "FP16"
     chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
@@ -72,6 +73,13 @@ class ModelLoadRequest(BaseModel):
         default=None,
         examples=[4096],
     )
+    cache_size: Optional[int] = Field(
+        description=(
+            "Number in tokens, must be greater than or equal to max_seq_len"
+        ),
+        default=None,
+        examples=[4096],
+    )
     gpu_split_auto: Optional[bool] = True
     autosplit_reserve: Optional[List[float]] = [96]
     gpu_split: Optional[List[float]] = Field(
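For illustration, a request payload this schema would accept; the field names below come from the hunks above except "name", which is an assumption, and all values are made up:

load_payload = {
    "name": "my-model",      # assumed field, not shown in this diff
    "max_seq_len": 4096,
    "cache_size": 8192,      # must be >= max_seq_len
    "gpu_split_auto": True,
}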