diff --git a/docs/02.-Server-options.md b/docs/02.-Server-options.md
index d97c7bd..98cee55 100644
--- a/docs/02.-Server-options.md
+++ b/docs/02.-Server-options.md
@@ -47,8 +47,6 @@ Note: These are experimental flags that may be removed at any point.
 | ------------------------- | -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
 | unsafe_launch             | Bool (False)   | Skips dependency checks on startup. Only recommended for debugging.                                                                               |
 | disable_request_streaming | Bool (False)   | Forcefully disables streaming requests                                                                                                            |
-| cuda_malloc_backend | Bool (False) | Uses pytorch's CUDA malloc backend to load models. Helps save VRAM.
-Safe to enable. |
| realtime_process_priority | Bool (False) | Set the process priority to "Realtime". Administrator/sudo access required, otherwise the priority is set to the highest it can go in userland. |
### Model Options
diff --git a/main.py b/main.py
index eed45df..7bb2450 100644
--- a/main.py
+++ b/main.py
@@ -1,8 +1,12 @@
"""The main tabbyAPI module. Contains the FastAPI server and endpoints."""
+# Set this env var for cuda malloc async before torch is initialized
+import os
+
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
+
import argparse
import asyncio
-import os
import pathlib
import platform
import signal
@@ -154,11 +158,6 @@ def entrypoint(
raise SystemExit(install_message)
- # Enable CUDA malloc backend
- if config.developer.cuda_malloc_backend:
- os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
- logger.warning("EXPERIMENTAL: Enabled the pytorch CUDA malloc backend.")
-
# Set the process priority
if config.developer.realtime_process_priority:
import psutil