diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 647f2fa..fd71ba3 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -3,8 +3,6 @@
 import gc
 import math
 import pathlib
-import threading
-import time
 import traceback
 import torch
 import uuid
@@ -57,10 +55,11 @@ class ExllamaV2Container:
     generator: Optional[ExLlamaV2DynamicGeneratorAsync] = None
     prompt_template: Optional[PromptTemplate] = None
     active_loras: List[ExLlamaV2Lora] = []
+    paged: bool = True

     # Internal config vars
     cache_mode: str = "FP16"
-    use_cfg: bool = False
+    max_batch_size: int = 20
     generation_config: Optional[GenerationConfig] = None

     # GPU split vars
@@ -115,10 +114,6 @@ class ExllamaV2Container:
                 available devices (default: True)
             'gpu_split' (list[float]): Allocation for weights and (some)
                 tensors, per device
-            'no_flash_attn' (bool): Turns off flash attention
-                (increases vram usage) (default: False)
-            'use_cfg" (bool): Enables CFG support. Disables flash attention
-                (default: False)
         """

         self.quiet = quiet
@@ -184,18 +179,9 @@ class ExllamaV2Container:
             kwargs.get("rope_alpha"), self.calculate_rope_alpha(base_seq_len)
         )

-        # Enable CFG if present
-        self.use_cfg = unwrap(kwargs.get("use_cfg"), False)
-
         # Enable fasttensors loading if present
         self.config.fasttensors = unwrap(kwargs.get("fasttensors"), False)

-        # Turn off flash attention if CFG is on
-        # Workaround until batched FA2 is fixed in exllamav2 upstream
-        # self.config.no_flash_attn = (
-        #     True if self.use_cfg else unwrap(kwargs.get("no_flash_attention"), False)
-        # )
-
         # Try to set prompt template
         self.prompt_template = self.find_prompt_template(
             kwargs.get("prompt_template"), model_directory
@@ -345,7 +331,6 @@ class ExllamaV2Container:
             "cache_mode": self.cache_mode,
             "chunk_size": self.config.max_input_len,
             "num_experts_per_token": self.config.num_experts_per_token,
-            "use_cfg": self.use_cfg,
             "prompt_template": self.prompt_template.name
             if self.prompt_template
             else None,
@@ -420,10 +405,24 @@ class ExllamaV2Container:
         async for value in iterate_in_threadpool(model_load_generator):
             yield value

-        # TODO: Change these!
-        # Set the max batch size and check if paged support is available
-        max_batch_size = 1 if self.config.no_flash_attn else 20
-        paged = not self.config.no_flash_attn
+        # Disable paged mode if the user's minimum GPU is unsupported (requires Ampere and above)
+        min_compute_capability = min(
+            set(
+                [
+                    torch.cuda.get_device_capability(device=module.device_idx)[0]
+                    for module in self.model.modules
+                    if module.device_idx >= 0
+                ]
+            )
+        )
+
+        if torch.version.hip or min_compute_capability < 8:
+            logger.warning(
+                "An unsupported GPU was found in this configuration. "
+                "Switching to compatibility mode. This disables parallel batching."
+            )
+            self.paged = False
+            self.max_batch_size = 1

         # Create async generator
         self.generator = ExLlamaV2DynamicGeneratorAsync(
@@ -432,8 +431,8 @@ class ExllamaV2Container:
             draft_model=self.draft_model,
             draft_cache=self.draft_cache,
             tokenizer=self.tokenizer,
-            max_batch_size=max_batch_size,
-            paged=paged,
+            max_batch_size=self.max_batch_size,
+            paged=self.paged,
         )

         # Clean up any extra vram usage from torch and cuda
@@ -741,7 +740,7 @@ class ExllamaV2Container:
         cfg_scale = unwrap(kwargs.get("cfg_scale"), 1.0)
         negative_prompt = None
         if cfg_scale not in [None, 1.0]:
-            if self.use_cfg:
+            if self.paged:
                 gen_settings.cfg_scale = cfg_scale

                 # If the negative prompt is empty, use the BOS token
@@ -752,8 +751,8 @@ class ExllamaV2Container:
                 prompts.append(negative_prompt)
             else:
                 logger.warning(
-                    "CFG is currently disabled. "
-                    "If your GPU is supported, reload your model with use_cfg = True"
+                    "CFG is currently disabled because paged mode is disabled. "
+                    "Please use an Ampere (30 series) or newer GPU for CFG support."
                 )

         gen_settings.token_repetition_penalty = unwrap(
diff --git a/config_sample.yml b/config_sample.yml
index 9cc8337..ead55b5 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -100,9 +100,6 @@ model:
   # Leave blank to automatically calculate alpha
   #rope_alpha: 1.0

-  # Disable Flash-attention 2. Set to True for GPUs lower than Nvidia's 3000 series. (default: False)
-  #no_flash_attention: False
-
   # Enable different cache modes for VRAM savings (slight performance hit).
   # Possible values FP16, FP8, Q4. (default: FP16)
   #cache_mode: FP16
@@ -111,6 +108,12 @@ model:
   # NOTE: Effects vary depending on the model. An ideal value is between 512 and 4096
   #chunk_size: 2048

+  # Set the maximum number of prompts to process at one time (batch size)
+  # This will be automatically adjusted depending on the cache size.
+  # A max batch size of 1 processes prompts one at a time.
+  # NOTE: Only available for Nvidia Ampere (30 series) and above GPUs
+  #max_batch_size: 20
+
   # Set the prompt template for this model. If empty, attempts to look for the model's chat template. (default: None)
   # If a model contains multiple templates in its tokenizer_config.json, set prompt_template to the name
   # of the template you want to use.
@@ -122,10 +125,6 @@ model:
   # NOTE: For MoE models (ex. Mixtral) only!
   #num_experts_per_token:

-  # Enables CFG support (default: False)
-  # WARNING: This flag disables Flash Attention! (a stopgap fix until it's fixed in upstream)
-  #use_cfg: False
-
   # Enables fasttensors to possibly increase model loading speeds (default: False)
   #fasttensors: true

diff --git a/endpoints/OAI/types/model.py b/endpoints/OAI/types/model.py
index 22895be..41dc68e 100644
--- a/endpoints/OAI/types/model.py
+++ b/endpoints/OAI/types/model.py
@@ -19,7 +19,6 @@ class ModelCardParameters(BaseModel):
     chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
     num_experts_per_token: Optional[int] = None
-    use_cfg: Optional[bool] = None

     # Draft is another model, so include it in the card params
     draft: Optional["ModelCard"] = None
@@ -94,7 +93,6 @@ class ModelLoadRequest(BaseModel):
     chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
     num_experts_per_token: Optional[int] = None
-    use_cfg: Optional[bool] = None
    fasttensors: Optional[bool] = False
    draft: Optional[DraftModelLoadRequest] = None
    skip_queue: Optional[bool] = False
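
For reference, a minimal standalone sketch of the compatibility check this patch introduces: paged mode (and with it parallel batching and CFG) needs flash attention, which requires an NVIDIA GPU of compute capability 8.0 or higher (Ampere / 30 series and newer) on a non-ROCm build of torch. The helper name `supports_paged_mode` and the device enumeration below are illustrative only; the patch itself inspects `self.model.modules` at load time.

```python
# Hypothetical helper mirroring the patch's check, not part of the patch itself.
import torch


def supports_paged_mode(device_indices):
    """Return True if every listed CUDA device can use paged attention."""
    if torch.version.hip:
        # ROCm builds switch to compatibility mode, as in the patch
        return False

    # Take the lowest compute capability major version across all devices
    min_major = min(
        torch.cuda.get_device_capability(device=idx)[0] for idx in device_indices
    )
    return min_major >= 8


if __name__ == "__main__":
    devices = list(range(torch.cuda.device_count()))
    paged = bool(devices) and supports_paged_mode(devices)
    max_batch_size = 20 if paged else 1
    print(f"paged={paged}, max_batch_size={max_batch_size}")
```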