From 46304ce875a6814c68b3cd45f617bc45196e49af Mon Sep 17 00:00:00 2001
From: kingbri
Date: Tue, 30 Jul 2024 18:42:25 -0400
Subject: [PATCH] Model: Properly pass in max_batch_size from config

The override wasn't being passed in before. Also, the default is now
none since Exl2 can automatically calculate the max batch size.

Signed-off-by: kingbri
---
 backends/exllamav2/model.py | 5 ++++-
 config_sample.yml           | 6 +++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 3df16b0..c7c032a 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -70,7 +70,7 @@ class ExllamaV2Container:
     cache_size: int = None
     cache_mode: str = "FP16"
     draft_cache_mode: str = "FP16"
-    max_batch_size: int = 20
+    max_batch_size: Optional[int] = None
     generation_config: Optional[GenerationConfig] = None
     hf_config: Optional[HuggingFaceConfig] = None
 
@@ -217,6 +217,9 @@ class ExllamaV2Container:
         # Enable fasttensors loading if present
         self.config.fasttensors = unwrap(kwargs.get("fasttensors"), False)
 
+        # Set max batch size to the config override
+        self.max_batch_size = unwrap(kwargs.get("max_batch_size"))
+
         # Check whether the user's configuration supports flash/paged attention
         # Also check if exl2 has disabled flash attention
         if (
diff --git a/config_sample.yml b/config_sample.yml
index c92f673..e57d947 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -146,11 +146,11 @@ model:
   # NOTE: Effects vary depending on the model. An ideal value is between 512 and 4096
   #chunk_size: 2048
 
-  # Set the maximum amount of prompts to process at one time (batch)
-  # This will be automatically adjusted depending on the cache size.
+  # Set the maximum amount of prompts to process at one time (default: None/Automatic)
+  # This will be automatically calculated if left blank.
   # A max batch size of 1 processes prompts one at a time.
   # NOTE: Only available for Nvidia ampere (30 series) and above GPUs
-  #max_batch_size: 20
+  #max_batch_size:
 
   # Set the prompt template for this model. If empty, attempts to look for the model's chat template. (default: None)
   # If a model contains multiple templates in its tokenizer_config.json, set prompt_template to the name
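
For reviewers, a minimal standalone sketch of the intended behavior. The local
`unwrap` below is a stand-in assumed to match the project's utility helper
(value if not None, else the default), and the config snippet is illustrative,
not part of the patch:

    # Sketch only: shows why dropping the default of 20 lets a blank YAML key
    # reach the backend as None so exl2 can auto-calculate the batch size.
    import yaml  # PyYAML


    def unwrap(wrapped, default=None):
        """Assumed helper: return `wrapped` unless it is None, else `default`."""
        return wrapped if wrapped is not None else default


    config_text = """
    model:
      # Left blank in config_sample.yml -> YAML parses this as None
      max_batch_size:
    """

    model_config = yaml.safe_load(config_text)["model"]

    # Mirrors the new line in ExllamaV2Container.__init__: the override is now
    # passed through instead of being silently ignored, and a blank value stays
    # None rather than falling back to a hardcoded 20.
    max_batch_size = unwrap(model_config.get("max_batch_size"))
    print(max_batch_size)  # None -> backend decides

    # An explicit override is forwarded unchanged
    print(unwrap({"max_batch_size": 4}.get("max_batch_size")))  # 4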