diff --git a/docs/02.-Server-options.md b/docs/02.-Server-options.md
index d97c7bd..98cee55 100644
--- a/docs/02.-Server-options.md
+++ b/docs/02.-Server-options.md
@@ -47,8 +47,6 @@ Note: These are experimental flags that may be removed at any point.
 | ------------------------- | -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
 | unsafe_launch             | Bool (False)   | Skips dependency checks on startup. Only recommended for debugging.                                                                               |
 | disable_request_streaming | Bool (False)   | Forcefully disables streaming requests                                                                                                            |
-| cuda_malloc_backend | Bool (False) | Uses pytorch's CUDA malloc backend to load models. Helps save VRAM.
-Safe to enable. |
| realtime_process_priority | Bool (False) | Set the process priority to "Realtime". Administrator/sudo access required, otherwise the priority is set to the highest it can go in userland. |
### Model Options
diff --git a/main.py b/main.py
index eed45df..7bb2450 100644
--- a/main.py
+++ b/main.py
@@ -1,8 +1,12 @@
"""The main tabbyAPI module. Contains the FastAPI server and endpoints."""
+# Set this env var for cuda malloc async before torch is initialized
+import os
+
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
+
import argparse
import asyncio
-import os
import pathlib
import platform
import signal
@@ -154,11 +158,6 @@ def entrypoint(
raise SystemExit(install_message)
- # Enable CUDA malloc backend
- if config.developer.cuda_malloc_backend:
- os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
- logger.warning("EXPERIMENTAL: Enabled the pytorch CUDA malloc backend.")
-
# Set the process priority
if config.developer.realtime_process_priority:
import psutil