Model: Properly pass in max_batch_size from config
The override wasn't being passed in before. Also, the default is now None, since Exl2 can automatically calculate the max batch size.

Signed-off-by: kingbri <bdashore3@proton.me>
parent d85414738d
commit 46304ce875

2 changed files with 7 additions and 4 deletions
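To make the intent of the change concrete, here is a hedged before/after sketch of the attribute default that the first hunk below touches. Only the attribute name and its types come from the diff; the classes and print calls are illustrative.

from typing import Optional

class Before:
    # Old behavior: the config.yml override was never forwarded, so the
    # container always ran with this hard-coded default.
    max_batch_size: int = 20

class After:
    # New behavior: None means "no override", leaving the max batch size
    # for the backend to calculate automatically.
    max_batch_size: Optional[int] = None

print(Before.max_batch_size)  # 20, regardless of what config.yml says
print(After.max_batch_size)   # None until a config override is applied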
@@ -70,7 +70,7 @@ class ExllamaV2Container:
     cache_size: int = None
     cache_mode: str = "FP16"
     draft_cache_mode: str = "FP16"
-    max_batch_size: int = 20
+    max_batch_size: Optional[int] = None
     generation_config: Optional[GenerationConfig] = None
     hf_config: Optional[HuggingFaceConfig] = None
 
@@ -217,6 +217,9 @@ class ExllamaV2Container:
         # Enable fasttensors loading if present
         self.config.fasttensors = unwrap(kwargs.get("fasttensors"), False)
 
+        # Set max batch size to the config override
+        self.max_batch_size = unwrap(kwargs.get("max_batch_size"))
+
         # Check whether the user's configuration supports flash/paged attention
         # Also check if exl2 has disabled flash attention
         if (
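The assignment added in the hunk above relies on the project's unwrap helper. Below is a minimal sketch of how that line behaves, assuming unwrap simply returns its first argument when it is not None and the supplied default (None here) otherwise; everything outside the diff's own names is illustrative.

from typing import Optional, TypeVar

T = TypeVar("T")

def unwrap(value: Optional[T], default: Optional[T] = None) -> Optional[T]:
    # Assumed semantics: fall back to the default only when the value is missing.
    return value if value is not None else default

kwargs = {"fasttensors": True}  # config without a max_batch_size override
max_batch_size = unwrap(kwargs.get("max_batch_size"))
print(max_batch_size)  # None -> Exl2 calculates the max batch size itself

kwargs["max_batch_size"] = 4
print(unwrap(kwargs.get("max_batch_size")))  # 4 -> explicit override is kept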
@@ -146,11 +146,11 @@ model:
   # NOTE: Effects vary depending on the model. An ideal value is between 512 and 4096
   #chunk_size: 2048
 
-  # Set the maximum amount of prompts to process at one time (batch)
-  # This will be automatically adjusted depending on the cache size.
+  # Set the maximum amount of prompts to process at one time (default: None/Automatic)
+  # This will be automatically calculated if left blank.
   # A max batch size of 1 processes prompts one at a time.
   # NOTE: Only available for Nvidia ampere (30 series) and above GPUs
-  #max_batch_size: 20
+  #max_batch_size:
 
   # Set the prompt template for this model. If empty, attempts to look for the model's chat template. (default: None)
   # If a model contains multiple templates in its tokenizer_config.json, set prompt_template to the name
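As a usage note for the sample config above: leaving max_batch_size commented out or blank yields None after parsing, which is the value the loader now treats as "calculate automatically". A small illustrative check follows (PyYAML assumed; this snippet is not part of the project):

import yaml

blank = yaml.safe_load("model:\n  max_batch_size:\n")
print(blank["model"]["max_batch_size"])  # None -> automatic batch sizing

explicit = yaml.safe_load("model:\n  max_batch_size: 1\n")
print(explicit["model"]["max_batch_size"])  # 1 -> process prompts one at a time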