diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index 6bec250..793ec35 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -1,5 +1,6 @@
 import asyncio
 import gc
+import math
 import pathlib
 from loguru import logger
 from typing import (
@@ -46,8 +47,11 @@ class ExllamaV3Container(BaseModelContainer):
     cache: Cache
     tokenizer: Tokenizer
     config: Config
-    gpu_split: List[float] = []
+    gpu_split: List[float] | None = None
+    gpu_split_auto: bool = True
+    autosplit_reserve: List[float] = [96 * 1024**2]
     max_seq_len: int
+    use_tp: bool = False
 
     # Required methods
     @classmethod
@@ -76,17 +80,43 @@ class ExllamaV3Container(BaseModelContainer):
         self.max_seq_len = kwargs.get("max_seq_len")
         self.cache = Cache(self.model, max_num_tokens=self.max_seq_len)
 
-        gpu_split = unwrap(kwargs.get("gpu_split"), [])
-        # Set GPU split options
-        # Enable manual GPU split if provided
-        if gpu_split:
-            self.gpu_split = gpu_split
 
         # Try to set prompt template
         self.prompt_template = await find_prompt_template(
             kwargs.get("prompt_template"), model_directory
         )
 
+        # Turn off GPU split if the user is using 1 GPU
+        gpu_count = torch.cuda.device_count()
+        gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+        gpu_split = unwrap(kwargs.get("gpu_split"), None)
+
+        # Set GPU split options
+        if gpu_count == 1:
+            self.gpu_split_auto = False
+            logger.info("Disabling GPU split because one GPU is in use.")
+        else:
+            # TODO: Set tensor parallel
+
+            # Set GPU split options
+            # Enable manual GPU split if provided
+            if gpu_split:
+                self.gpu_split = gpu_split
+            elif gpu_split_auto and not self.use_tp:
+                # Otherwise fallback to autosplit settings
+                self.gpu_split_auto = gpu_split_auto
+
+                autosplit_reserve_megabytes = unwrap(
+                    kwargs.get("autosplit_reserve"), [96]
+                )
+
+                # Reserve VRAM for each GPU
+                self.autosplit_reserve = [
+                    int(math.ceil(value * 1024**2))
+                    for value in autosplit_reserve_megabytes
+                ]
+        # TODO: speculative decoding
+
         return self
 
     async def load(self, progress_callback=None, **kwargs):
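
For reference, the patch resolves the split mode in this order: splitting is disabled outright on a single GPU, a manual `gpu_split` takes priority when provided, and otherwise `gpu_split_auto` applies as long as tensor parallelism is off. In the autosplit path, `autosplit_reserve` is read from the loader kwargs in megabytes and each per-GPU entry is converted to bytes. A minimal standalone sketch of that conversion follows; the two-GPU input `[96, 512]` is hypothetical (only `96` is the default from the patch):

```python
import math

# Hypothetical per-GPU reserve values in megabytes (96 mirrors the patch's
# default first entry; 512 is just an illustrative second GPU)
autosplit_reserve_megabytes = [96, 512]

# Same conversion the patch applies: megabytes -> bytes, rounded up
autosplit_reserve = [
    int(math.ceil(value * 1024**2)) for value in autosplit_reserve_megabytes
]

print(autosplit_reserve)  # [100663296, 536870912]
```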