Model: Properly pass in max_batch_size from config

The override wasn't being passed in before. Also, the default is now
None, since Exl2 can automatically calculate the max batch size.

Signed-off-by: kingbri <bdashore3@proton.me>
kingbri 2024-07-30 18:42:25 -04:00
parent d85414738d
commit 46304ce875
2 changed files with 7 additions and 4 deletions

@@ -70,7 +70,7 @@ class ExllamaV2Container:
     cache_size: int = None
     cache_mode: str = "FP16"
     draft_cache_mode: str = "FP16"
-    max_batch_size: int = 20
+    max_batch_size: Optional[int] = None
     generation_config: Optional[GenerationConfig] = None
     hf_config: Optional[HuggingFaceConfig] = None
@@ -217,6 +217,9 @@ class ExllamaV2Container:
         # Enable fasttensors loading if present
         self.config.fasttensors = unwrap(kwargs.get("fasttensors"), False)
 
+        # Set max batch size to the config override
+        self.max_batch_size = unwrap(kwargs.get("max_batch_size"))
+
         # Check whether the user's configuration supports flash/paged attention
         # Also check if exl2 has disabled flash attention
         if (

@@ -146,11 +146,11 @@ model:
   # NOTE: Effects vary depending on the model. An ideal value is between 512 and 4096
   #chunk_size: 2048
 
-  # Set the maximum amount of prompts to process at one time (batch)
-  # This will be automatically adjusted depending on the cache size.
+  # Set the maximum amount of prompts to process at one time (default: None/Automatic)
+  # This will be automatically calculated if left blank.
   # A max batch size of 1 processes prompts one at a time.
   # NOTE: Only available for Nvidia ampere (30 series) and above GPUs
-  #max_batch_size: 20
+  #max_batch_size:
 
   # Set the prompt template for this model. If empty, attempts to look for the model's chat template. (default: None)
   # If a model contains multiple templates in its tokenizer_config.json, set prompt_template to the name
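Note: the override line only yields an automatic batch size because unwrap is called without a second argument. Assuming unwrap is the usual null-coalescing helper in this codebase (return the value when it is not None, otherwise the default), an unset max_batch_size in the config resolves to None, which is what lets Exl2 calculate the batch size itself. A minimal sketch of that assumption, not the exact source:

    # Sketch only: assumed behavior of the unwrap() helper used in the diff.
    def unwrap(wrapped, default=None):
        # Return the wrapped value unless it is None, otherwise fall back to the default.
        return wrapped if wrapped is not None else default

    kwargs = {}  # no max_batch_size override supplied with the model load request
    max_batch_size = unwrap(kwargs.get("max_batch_size"))
    print(max_batch_size)  # None -> the backend computes the batch size automatically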