diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 29caa34..fd23a28 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -235,11 +235,10 @@ class ExllamaV2Container(BaseModelContainer):
 
         # Grab the base model's sequence length before overrides for
         # rope calculations
-        base_seq_len = self.config.max_seq_len
+        base_seq_len = hf_model.hf_config.max_position_embeddings
 
         # Set the target seq len if present
-        # Fallback to base_seq_len if not provided
-        target_seq_len = unwrap(kwargs.get("max_seq_len"), base_seq_len)
+        target_seq_len = unwrap(kwargs.get("max_seq_len"), 4096)
 
         # Set the rope scale
         self.config.scale_pos_emb = unwrap(
@@ -247,6 +246,7 @@ class ExllamaV2Container(BaseModelContainer):
         )
 
         # Sets rope alpha value.
+        # Utilize the model's max_position_embeddings as a base value
         # Automatically calculate if unset or defined as an "auto" literal.
         rope_alpha = unwrap(kwargs.get("rope_alpha"), "auto")
         if rope_alpha == "auto":
@@ -371,7 +371,7 @@ class ExllamaV2Container(BaseModelContainer):
         )
 
         # Set draft rope alpha. Follows same behavior as model rope alpha.
-        # Use the base sequence length of the model
+        # Use the max_position_embeddings of the model
         draft_rope_alpha = unwrap(draft_args.get("draft_rope_alpha"), "auto")
         if draft_rope_alpha == "auto":
             self.draft_config.scale_alpha_value = calculate_rope_alpha(
@@ -399,6 +399,8 @@ class ExllamaV2Container(BaseModelContainer):
             self.draft_config.max_input_len = chunk_size
             self.draft_config.max_attention_size = chunk_size**2
 
+        print(self.config.max_seq_len)
+
         # Return the created instance
         return self
 
diff --git a/common/config_models.py b/common/config_models.py
index 0958a8e..46a7b5e 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -175,10 +175,10 @@ class ModelConfig(BaseConfigModel):
     max_seq_len: Optional[int] = Field(
         None,
         description=(
-            "Max sequence length (default: Empty).\n"
-            "Fetched from the model's base sequence length in config.json by default."
+            "Max sequence length (default: 4096).\n"
+            "Set to -1 to fetch from the model's config.json"
         ),
-        ge=0,
+        ge=-1,
     )
     tensor_parallel: Optional[bool] = Field(
         False,
diff --git a/common/model.py b/common/model.py
index 1935025..6534951 100644
--- a/common/model.py
+++ b/common/model.py
@@ -95,7 +95,7 @@ async def apply_inline_overrides(model_dir: pathlib.Path, **kwargs):
     else:
         logger.warning(
             "Cannot find inline model overrides. "
-            "Make sure they are nested under a \"model:\" key"
+            'Make sure they are nested under a "model:" key'
         )
 
     # Merge draft overrides beforehand
@@ -148,6 +148,13 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
     # Fetch the extra HF configuration options
     hf_model = await HFModel.from_directory(model_path)
 
+    # Override the max sequence length based on user
+    max_seq_len = kwargs.get("max_seq_len")
+    if max_seq_len == -1:
+        kwargs["max_seq_len"] = hf_model.hf_config.max_position_embeddings
+    elif max_seq_len is None:
+        kwargs["max_seq_len"] = 4096
+
     # Create a new container and check if the right dependencies are installed
     backend = unwrap(kwargs.get("backend"), detect_backend(hf_model))
     container_class = _BACKEND_REGISTRY.get(backend)
diff --git a/common/transformers_utils.py b/common/transformers_utils.py
index 6cac3b8..a7b0f0c 100644
--- a/common/transformers_utils.py
+++ b/common/transformers_utils.py
@@ -39,12 +39,11 @@ class GenerationConfig(BaseModel):
 
 class HuggingFaceConfig(BaseModel):
     """
-    DEPRECATED: Currently a stub and doesn't do anything.
-
     An abridged version of HuggingFace's model config. Will be expanded as
     needed.
     """
 
+    max_position_embeddings: int = 4096
     eos_token_id: Optional[Union[int, List[int]]] = None
     quantization_config: Optional[Dict] = None
 
diff --git a/config_sample.yml b/config_sample.yml
index ffe2605..1ee82a9 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -78,8 +78,8 @@ model:
   # Options: exllamav2, exllamav3
   backend:
 
-  # Max sequence length (default: Empty).
-  # Fetched from the model's base sequence length in config.json by default.
+  # Max sequence length (default: 4096).
+  # Set to -1 to fetch from the model's config.json
   max_seq_len:
 
   # Load model with tensor parallelism.
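Note on the behavior change: with this diff, `max_seq_len` resolves as explicit value > `-1` (fetch `max_position_embeddings` from the model's `config.json`) > unset (fall back to 4096), as added in `common/model.py`. Below is a minimal standalone sketch of that precedence for reference; `resolve_max_seq_len` is a hypothetical helper written for illustration and is not part of this diff.

```python
from typing import Optional


def resolve_max_seq_len(
    user_value: Optional[int], max_position_embeddings: int
) -> int:
    """Resolve the effective max sequence length.

    -1   -> use the model's max_position_embeddings from config.json
    None -> fall back to the 4096 default
    else -> use the user-provided value as-is
    """
    if user_value == -1:
        return max_position_embeddings
    if user_value is None:
        return 4096
    return user_value


# Example: a model whose config.json advertises 32768 positions
assert resolve_max_seq_len(-1, 32768) == 32768
assert resolve_max_seq_len(None, 32768) == 4096
assert resolve_max_seq_len(8192, 32768) == 8192
```

In config terms, leaving `max_seq_len:` empty in `config_sample.yml` now yields 4096, while `max_seq_len: -1` restores the old behavior of reading the length from the model's `config.json`.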