From 46304ce875a6814c68b3cd45f617bc45196e49af Mon Sep 17 00:00:00 2001
From: kingbri
Date: Tue, 30 Jul 2024 18:42:25 -0400
Subject: [PATCH] Model: Properly pass in max_batch_size from config

The override wasn't being passed in before. Also, the default is now
none since Exl2 can automatically calculate the max batch size.

Signed-off-by: kingbri
---
 backends/exllamav2/model.py | 5 ++++-
 config_sample.yml           | 6 +++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 3df16b0..c7c032a 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -70,7 +70,7 @@ class ExllamaV2Container:
     cache_size: int = None
     cache_mode: str = "FP16"
     draft_cache_mode: str = "FP16"
-    max_batch_size: int = 20
+    max_batch_size: Optional[int] = None
     generation_config: Optional[GenerationConfig] = None
     hf_config: Optional[HuggingFaceConfig] = None
 
@@ -217,6 +217,9 @@ class ExllamaV2Container:
         # Enable fasttensors loading if present
         self.config.fasttensors = unwrap(kwargs.get("fasttensors"), False)
 
+        # Set max batch size to the config override
+        self.max_batch_size = unwrap(kwargs.get("max_batch_size"))
+
         # Check whether the user's configuration supports flash/paged attention
         # Also check if exl2 has disabled flash attention
         if (
diff --git a/config_sample.yml b/config_sample.yml
index c92f673..e57d947 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -146,11 +146,11 @@ model:
   # NOTE: Effects vary depending on the model. An ideal value is between 512 and 4096
   #chunk_size: 2048
 
-  # Set the maximum amount of prompts to process at one time (batch)
-  # This will be automatically adjusted depending on the cache size.
+  # Set the maximum amount of prompts to process at one time (default: None/Automatic)
+  # This will be automatically calculated if left blank.
   # A max batch size of 1 processes prompts one at a time.
   # NOTE: Only available for Nvidia ampere (30 series) and above GPUs
-  #max_batch_size: 20
+  #max_batch_size:
 
   # Set the prompt template for this model. If empty, attempts to look for the model's chat template. (default: None)
   # If a model contains multiple templates in its tokenizer_config.json, set prompt_template to the name
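
For reviewers, a minimal standalone sketch of the intended behavior. The local
`unwrap` below is a stand-in assumed to match the project's utility helper
(value if not None, else the default), and the config snippet is illustrative,
not part of the patch:

    # Sketch only: shows why dropping the default of 20 lets a blank YAML key
    # reach the backend as None so exl2 can auto-calculate the batch size.
    import yaml  # PyYAML


    def unwrap(wrapped, default=None):
        """Assumed helper: return `wrapped` unless it is None, else `default`."""
        return wrapped if wrapped is not None else default


    config_text = """
    model:
      # Left blank in config_sample.yml -> YAML parses this as None
      max_batch_size:
    """

    model_config = yaml.safe_load(config_text)["model"]

    # Mirrors the new line in ExllamaV2Container.__init__: the override is now
    # passed through instead of being silently ignored, and a blank value stays
    # None rather than falling back to a hardcoded 20.
    max_batch_size = unwrap(model_config.get("max_batch_size"))
    print(max_batch_size)  # None -> backend decides

    # An explicit override is forwarded unchanged
    print(unwrap({"max_batch_size": 4}.get("max_batch_size")))  # 4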