Main: Enable cudaMallocAsync backend by default

Works on CUDA 12.4 and up. If CUDA isn't present, the backend isn't
enabled. Since this env var must be set before torch is initialized,
it can't be configured via config.yml.

This backend used to be experimental, but it's safe to keep enabled by
default since it only provides a benefit.
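The gist of the change can be sketched as follows. This is not the commit's exact code: the `nvidia-smi` check is a hypothetical stand-in for however the project detects CUDA, but it illustrates the ordering constraint that `PYTORCH_CUDA_ALLOC_CONF` must be in the environment before `torch` is first imported, because the allocator backend is chosen at initialization.

```python
# Sketch (assumed detection logic, not the commit's code): enable the
# cudaMallocAsync allocator only when an NVIDIA driver appears present.
import os
import shutil


def maybe_enable_cuda_malloc_async() -> bool:
    """Set PYTORCH_CUDA_ALLOC_CONF if CUDA appears to be available."""
    # Hypothetical heuristic: a visible nvidia-smi binary implies CUDA exists
    if shutil.which("nvidia-smi") is None:
        return False
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
    return True


enabled = maybe_enable_cuda_malloc_async()
# import torch  # must happen only after the env var is set
```

If torch were imported before the assignment, the setting would be silently ignored, which is why the commit moves it to the very top of main.py instead of leaving it behind a config option.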

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
kingbri 2025-07-27 22:29:46 -04:00
parent 0b4ca567f8
commit 113643c0df
2 changed files with 5 additions and 7 deletions

main.py: 11 changes

@@ -1,8 +1,12 @@
 """The main tabbyAPI module. Contains the FastAPI server and endpoints."""
+# Set this env var for cuda malloc async before torch is initialized
+import os
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
 import argparse
 import asyncio
 import os
 import pathlib
 import platform
 import signal
@@ -154,11 +158,6 @@ def entrypoint(
         raise SystemExit(install_message)
-    # Enable CUDA malloc backend
-    if config.developer.cuda_malloc_backend:
-        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
-        logger.warning("EXPERIMENTAL: Enabled the pytorch CUDA malloc backend.")
     # Set the process priority
     if config.developer.realtime_process_priority:
         import psutil