Merge pull request #111 from DocShotgun/main

Add support for specifying k/v cache size
Brian Dashore 2024-05-26 20:52:21 -04:00 committed by GitHub
commit 3dcae8b023
4 changed files with 49 additions and 5 deletions


@@ -58,6 +58,7 @@ class ExllamaV2Container:
paged: bool = True
# Internal config vars
+ cache_size: int = None
cache_mode: str = "FP16"
max_batch_size: int = 20
generation_config: Optional[GenerationConfig] = None
@@ -91,9 +92,11 @@ class ExllamaV2Container:
loading_draft: bool)
**kwargs:
`cache_mode` (str): Sets cache mode, "FP16" or "FP8"
- (defaulf: "FP16")
+ (default: "FP16")
'max_seq_len' (int): Override model's default max sequence
length (default: 4096)
+ 'cache_size' (int): Num of tokens to allocate space for in the k/v cache
+ (default: max_seq_len)
'rope_scale' (float): Set RoPE scaling factor for model
(default: 1.0)
'rope_alpha' (float): Set RoPE alpha (NTK) factor for model
@@ -189,6 +192,16 @@ class ExllamaV2Container:
kwargs.get("rope_alpha"), self.calculate_rope_alpha(base_seq_len)
)
+ # Set k/v cache size
+ self.cache_size = unwrap(kwargs.get("cache_size"), self.config.max_seq_len)
+ if self.cache_size < self.config.max_seq_len:
+ logger.warning(
+ "Your specified cache_size is smaller than your "
+ "desired context length. \n"
+ "Defaulting cache_size to max_seq_len."
+ )
+ self.cache_size = self.config.max_seq_len
# Enable fasttensors loading if present
self.config.fasttensors = unwrap(kwargs.get("fasttensors"), False)
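The logic above defaults the cache size to the model's context length and refuses to let it shrink below it. A minimal standalone sketch of that behavior, with a hypothetical `resolve_cache_size` helper and the standard `logging` module standing in for tabbyAPI's `unwrap` and `logger`:

```python
import logging
from typing import Optional

logger = logging.getLogger(__name__)


def resolve_cache_size(requested: Optional[int], max_seq_len: int) -> int:
    """Default the k/v cache size to max_seq_len and never let it drop below it."""
    cache_size = requested if requested is not None else max_seq_len
    if cache_size < max_seq_len:
        logger.warning(
            "Your specified cache_size is smaller than your desired context length. "
            "Defaulting cache_size to max_seq_len."
        )
        cache_size = max_seq_len
    return cache_size


# A 4096-token cache against an 8192-token context is bumped up to 8192;
# an unset cache_size simply falls back to max_seq_len.
assert resolve_cache_size(4096, max_seq_len=8192) == 8192
assert resolve_cache_size(None, max_seq_len=4096) == 4096
```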
@@ -359,6 +372,7 @@ class ExllamaV2Container:
"rope_scale": self.config.scale_pos_emb,
"rope_alpha": self.config.scale_alpha_value,
"max_seq_len": self.config.max_seq_len,
"cache_size": self.cache_size,
"cache_mode": self.cache_mode,
"chunk_size": self.config.max_input_len,
"num_experts_per_token": self.config.num_experts_per_token,
@@ -484,7 +498,11 @@ class ExllamaV2Container:
if not self.quiet:
logger.info("Loading draft model: " + self.draft_config.model_dir)
- self.draft_cache = ExLlamaV2Cache(self.draft_model, lazy=True)
+ self.draft_cache = ExLlamaV2Cache(
+ self.draft_model,
+ max_seq_len=self.cache_size,
+ lazy=True,
+ )
for value in self.draft_model.load_autosplit_gen(
self.draft_cache,
reserve_vram=autosplit_reserve,
@@ -516,15 +534,24 @@ class ExllamaV2Container:
if self.cache_mode == "Q4":
self.cache = ExLlamaV2Cache_Q4(
- self.model, lazy=self.gpu_split_auto, batch_size=1
+ self.model,
+ max_seq_len=self.cache_size,
+ lazy=self.gpu_split_auto,
+ batch_size=1,
)
elif self.cache_mode == "FP8":
self.cache = ExLlamaV2Cache_8bit(
- self.model, lazy=self.gpu_split_auto, batch_size=1
+ self.model,
+ max_seq_len=self.cache_size,
+ lazy=self.gpu_split_auto,
+ batch_size=1,
)
else:
self.cache = ExLlamaV2Cache(
- self.model, lazy=self.gpu_split_auto, batch_size=1
+ self.model,
+ max_seq_len=self.cache_size,
+ lazy=self.gpu_split_auto,
+ batch_size=1,
)
# Load model with autosplit
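Every branch above now forwards `max_seq_len=self.cache_size` to the cache constructor; only the cache class differs by `cache_mode`. A dispatch-table sketch of the same selection, assuming the exllamav2 cache classes used in the diff (`build_cache` is a hypothetical helper, not part of this PR):

```python
from exllamav2 import ExLlamaV2Cache, ExLlamaV2Cache_8bit, ExLlamaV2Cache_Q4

# Same mapping as the if/elif chain: "Q4" -> 4-bit cache, "FP8" -> 8-bit cache,
# anything else -> the default FP16 cache.
CACHE_CLASSES = {
    "Q4": ExLlamaV2Cache_Q4,
    "FP8": ExLlamaV2Cache_8bit,
    "FP16": ExLlamaV2Cache,
}


def build_cache(model, cache_mode: str, cache_size: int, gpu_split_auto: bool):
    """Hypothetical helper mirroring the branches above."""
    cache_cls = CACHE_CLASSES.get(cache_mode, ExLlamaV2Cache)
    return cache_cls(
        model,
        max_seq_len=cache_size,
        lazy=gpu_split_auto,
        batch_size=1,
    )
```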


@@ -82,6 +82,11 @@ def add_model_args(parser: argparse.ArgumentParser):
type=str_to_bool,
help="Overrides base model context length",
)
+ model_group.add_argument(
+ "--cache-size",
+ type=int,
+ help="The size of the prompt cache (in number of tokens) to allocate",
+ )
model_group.add_argument(
"--rope-scale", type=float, help="Sets rope_scale or compress_pos_emb"
)
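For reference, a self-contained parser exercising just the new flag (the `main.py` entry point name is an assumption; tabbyAPI wires this into its own `add_model_args`):

```python
import argparse

parser = argparse.ArgumentParser(description="model options (excerpt)")
model_group = parser.add_argument_group("model")
model_group.add_argument(
    "--cache-size",
    type=int,
    help="The size of the prompt cache (in number of tokens) to allocate",
)

# Equivalent to running e.g. `python main.py --cache-size 8192` on the CLI.
args = parser.parse_args(["--cache-size", "8192"])
print(args.cache_size)  # -> 8192 (None when the flag is omitted)
```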


@@ -104,6 +104,12 @@ model:
# Possible values FP16, FP8, Q4. (default: FP16)
#cache_mode: FP16
+ # Size of the prompt cache to allocate (default: max_seq_len)
+ # This must be a multiple of 256. A larger cache uses more VRAM, but allows for more prompts to be processed at once.
+ # NOTE: Cache size should not be less than max_seq_len.
+ # For CFG, set this to 2 * max_seq_len to make room for both positive and negative prompts.
+ #cache_size:
# Chunk size for prompt ingestion. A lower value reduces VRAM usage at the cost of ingestion speed (default: 2048)
# NOTE: Effects vary depending on the model. An ideal value is between 512 and 4096
#chunk_size: 2048
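The sizing notes above reduce to simple arithmetic: double the context for CFG and keep the result a multiple of 256. A small hypothetical helper (not part of the PR) that computes a value to paste into `cache_size`:

```python
def suggested_cache_size(max_seq_len: int, use_cfg: bool = False) -> int:
    """Double for CFG (positive + negative prompts), then round up to a
    multiple of 256 as the config comments advise."""
    size = max_seq_len * 2 if use_cfg else max_seq_len
    return ((size + 255) // 256) * 256


print(suggested_cache_size(4096))                # 4096
print(suggested_cache_size(4096, use_cfg=True))  # 8192
```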


@@ -15,6 +15,7 @@ class ModelCardParameters(BaseModel):
max_seq_len: Optional[int] = None
rope_scale: Optional[float] = 1.0
rope_alpha: Optional[float] = 1.0
+ cache_size: Optional[int] = None
cache_mode: Optional[str] = "FP16"
chunk_size: Optional[int] = 2048
prompt_template: Optional[str] = None
@@ -72,6 +73,11 @@ class ModelLoadRequest(BaseModel):
default=None,
examples=[4096],
)
+ cache_size: Optional[int] = Field(
+ description=("Number in tokens, must be greater than or equal to max_seq_len"),
+ default=None,
+ examples=[4096],
+ )
gpu_split_auto: Optional[bool] = True
autosplit_reserve: Optional[List[float]] = [96]
gpu_split: Optional[List[float]] = Field(
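An illustrative payload for the model-load request with the new field; only `max_seq_len` and `cache_size` appear in this diff, so the model name key and the endpoint itself are assumptions:

```python
import json

payload = {
    "name": "my-model",      # assumed field, not shown in this diff
    "max_seq_len": 4096,
    "cache_size": 8192,      # must be >= max_seq_len; 2x here leaves room for CFG
}

print(json.dumps(payload, indent=2))
```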