diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 27fe33f..5d79e4f 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -185,6 +185,7 @@ class ExllamaV2Container(BaseModelContainer):
         # MARK: User configuration

         # Get cache mode
+        # TODO: Separate validation for Exl2 and Exl3 q-cache options
         self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")

         # Turn off GPU split if the user is using 1 GPU
diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index fc10a3d..18e04e8 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -1,6 +1,7 @@
 import asyncio
 import gc
 import pathlib
+import re
 import traceback
 from typing import (
     Any,
@@ -19,6 +20,7 @@ from exllamav3 import (
     Model,
     Tokenizer,
 )
+from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant
 from loguru import logger

 from backends.base_model_container import BaseModelContainer
@@ -73,6 +75,7 @@ class ExllamaV3Container(BaseModelContainer):
     use_tp: bool = False
     max_seq_len: int = 4096
     cache_size: int = 4096
+    cache_mode: str = "FP16"
     chunk_size: int = 2048
     max_batch_size: Optional[int] = None

@@ -219,7 +222,32 @@ class ExllamaV3Container(BaseModelContainer):
         # Cache
         user_cache_size = unwrap(kwargs.get("cache_size"), self.max_seq_len)
         self.cache_size = self.adjust_cache_size(user_cache_size)
-        self.cache = Cache(self.model, max_num_tokens=self.cache_size)
+        self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")
+
+        # Alias Exl2 q-cache settings
+        match self.cache_mode:
+            case "Q4":
+                self.cache_mode = "4,4"
+            case "Q6":
+                self.cache_mode = "6,6"
+            case "Q8":
+                self.cache_mode = "8,8"
+
+        split_cache_mode = re.search(r"^([2-8]),([2-8])$", self.cache_mode)
+        if split_cache_mode:
+            k_bits = int(split_cache_mode.group(1))
+            v_bits = int(split_cache_mode.group(2))
+            self.cache = Cache(
+                self.model,
+                max_num_tokens=self.cache_size,
+                layer_type=CacheLayer_quant,
+                k_bits=k_bits,
+                v_bits=v_bits,
+            )
+        else:
+            self.cache = Cache(
+                self.model, max_num_tokens=self.cache_size, layer_type=CacheLayer_fp16
+            )

         # Draft cache
         if self.use_draft_model:
@@ -314,7 +342,7 @@ class ExllamaV3Container(BaseModelContainer):
             max_seq_len=self.max_seq_len,
             cache_size=self.cache_size,
             max_batch_size=self.max_batch_size,
-            # cache_mode=self.cache_mode,
+            cache_mode=self.cache_mode,
             chunk_size=self.chunk_size,
             use_vision=self.use_vision,
         )
diff --git a/common/config_models.py b/common/config_models.py
index 2965fe0..8ee4ff7 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -1,6 +1,7 @@
 from pydantic import (
     BaseModel,
     ConfigDict,
+    constr,
     Field,
     PrivateAttr,
     field_validator,
@@ -9,6 +10,7 @@
 from typing import List, Literal, Optional, Union

 CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"]
+CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8],[2-8]$")]


 class Metadata(BaseModel):
@@ -227,11 +229,14 @@ class ModelConfig(BaseConfigModel):
             "or auto-calculate."
         ),
     )
-    cache_mode: Optional[CACHE_SIZES] = Field(
+    # TODO: Separate validation for Exl2 and Exl3 q-cache options
+    cache_mode: Optional[CACHE_TYPE] = Field(
         "FP16",
         description=(
             "Enable different cache modes for VRAM savings (default: FP16).\n"
-            f"Possible values: {str(CACHE_SIZES)[15:-1]}."
+            f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n"
+            "For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits "
+            "are integers from 2-8 (e.g. 8,8)."
         ),
     )
     cache_size: Optional[int] = Field(
diff --git a/config_sample.yml b/config_sample.yml
index a743c2c..045db51 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -114,7 +114,8 @@ model:
   rope_alpha:

   # Enable different cache modes for VRAM savings (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (e.g. 8,8).
   cache_mode: FP16

   # Size of the prompt cache to allocate (default: max_seq_len).
@@ -164,7 +165,8 @@ draft_model:
   draft_rope_alpha:

   # Cache mode for draft models to save VRAM (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (e.g. 8,8).
   draft_cache_mode: FP16

   # An integer array of GBs of VRAM to split between GPUs (default: []).
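
For reference, here is a minimal standalone sketch (not part of the patch) of how the new cache_mode string is interpreted. The helper name parse_cache_mode is hypothetical; the real logic lives inline in ExllamaV3Container.__init__ above, where the parsed bit widths select CacheLayer_quant and everything else falls back to CacheLayer_fp16.

import re

# Hypothetical helper, for illustration only: mirrors the alias-then-regex
# flow used in ExllamaV3Container.__init__ in this patch.
def parse_cache_mode(cache_mode: str) -> tuple[int, int] | None:
    """Return (k_bits, v_bits) for a quantized cache, or None for FP16."""

    # Legacy Exl2-style options map to symmetric k,v pairs
    aliases = {"Q4": "4,4", "Q6": "6,6", "Q8": "8,8"}
    cache_mode = aliases.get(cache_mode, cache_mode)

    # Accept "k,v" where both bit widths are single digits from 2-8
    split_cache_mode = re.search(r"^([2-8]),([2-8])$", cache_mode)
    if split_cache_mode:
        return int(split_cache_mode.group(1)), int(split_cache_mode.group(2))

    # Anything else ("FP16" after config validation) keeps full precision
    return None

assert parse_cache_mode("Q4") == (4, 4)    # legacy alias
assert parse_cache_mode("6,8") == (6, 8)   # asymmetric k/v bits
assert parse_cache_mode("FP16") is None    # full-precision default

Note that the constr pattern in CACHE_TYPE and the regex in the backend both accept only single-digit bit widths from 2 to 8, so malformed values such as 10,10 or 4;4 are rejected when the config is validated rather than silently falling back to FP16.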