From 113643c0df73a52685c7fc54768307c6e06a051b Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Sun, 27 Jul 2025 22:29:46 -0400
Subject: [PATCH] Main: Enable cudaMallocAsync backend by default

Works on cuda 12.4 and up. If CUDA doesn't exist, then don't enable the
backend.

This is an env var that needs to be set, so it's not really possible to
set it via config.yml.

This used to be experimental, but it's probably fine to keep it enabled
since it only provides a benefit.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 docs/02.-Server-options.md |  1 -
 main.py                    | 11 +++++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/docs/02.-Server-options.md b/docs/02.-Server-options.md
index d97c7bd..98cee55 100644
--- a/docs/02.-Server-options.md
+++ b/docs/02.-Server-options.md
@@ -47,7 +47,6 @@ Note: These are experimental flags that may be removed at any point.
 | ------------------------- | -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
 | unsafe_launch             | Bool (False)   | Skips dependency checks on startup. Only recommended for debugging.                                                                             |
 | disable_request_streaming | Bool (False)   | Forcefully disables streaming requests                                                                                                          |
-| cuda_malloc_backend       | Bool (False)   | Uses pytorch's CUDA malloc backend to load models. Helps save VRAM. Safe to enable.                                                             |
 | realtime_process_priority | Bool (False)   | Set the process priority to "Realtime". Administrator/sudo access required, otherwise the priority is set to the highest it can go in userland. |
 
 ### Model Options
diff --git a/main.py b/main.py
index eed45df..7bb2450 100644
--- a/main.py
+++ b/main.py
@@ -1,8 +1,12 @@
 """The main tabbyAPI module. Contains the FastAPI server and endpoints."""
 
+# Set this env var for cuda malloc async before torch is initialized
+import os
+
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
+
 import argparse
 import asyncio
-import os
 import pathlib
 import platform
 import signal
@@ -154,11 +158,6 @@ def entrypoint(
 
         raise SystemExit(install_message)
 
-    # Enable CUDA malloc backend
-    if config.developer.cuda_malloc_backend:
-        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
-        logger.warning("EXPERIMENTAL: Enabled the pytorch CUDA malloc backend.")
-
     # Set the process priority
     if config.developer.realtime_process_priority:
         import psutil