diff --git a/docs/02.-Server-options.md b/docs/02.-Server-options.md index d97c7bd..98cee55 100644 --- a/docs/02.-Server-options.md +++ b/docs/02.-Server-options.md @@ -47,7 +47,6 @@ Note: These are experimental flags that may be removed at any point. | ------------------------- | -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | unsafe_launch | Bool (False) | Skips dependency checks on startup. Only recommended for debugging. | | disable_request_streaming | Bool (False) | Forcefully disables streaming requests | -| cuda_malloc_backend | Bool (False) | Uses pytorch's CUDA malloc backend to load models. Helps save VRAM.

Safe to enable. | | realtime_process_priority | Bool (False) | Set the process priority to "Realtime". Administrator/sudo access required, otherwise the priority is set to the highest it can go in userland. | ### Model Options diff --git a/main.py b/main.py index eed45df..7bb2450 100644 --- a/main.py +++ b/main.py @@ -1,8 +1,12 @@ """The main tabbyAPI module. Contains the FastAPI server and endpoints.""" +# Set this env var for cuda malloc async before torch is initialized +import os + +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync" + import argparse import asyncio -import os import pathlib import platform import signal @@ -154,11 +158,6 @@ def entrypoint( raise SystemExit(install_message) - # Enable CUDA malloc backend - if config.developer.cuda_malloc_backend: - os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync" - logger.warning("EXPERIMENTAL: Enabled the pytorch CUDA malloc backend.") - # Set the process priority if config.developer.realtime_process_priority: import psutil