From 5c082b7e8c4570ceff108191b53b5497f7c00cf5 Mon Sep 17 00:00:00 2001 From: kingbri Date: Wed, 24 Jul 2024 18:56:28 -0400 Subject: [PATCH] Async: Add option to use Uvloop/Winloop These are faster event loops for asyncio which should improve overall performance. Gate these under an experimental flag for now to stress test these loops. Signed-off-by: kingbri --- common/args.py | 7 ++- config_sample.yml | 5 ++ endpoints/server.py | 5 ++ main.py | 110 +++++++++++++++++++++++++------------------- pyproject.toml | 4 ++ start.py | 4 +- 6 files changed, 85 insertions(+), 50 deletions(-) diff --git a/common/args.py b/common/args.py index bbf007c..e57de78 100644 --- a/common/args.py +++ b/common/args.py @@ -193,7 +193,12 @@ def add_developer_args(parser: argparse.ArgumentParser): developer_group.add_argument( "--cuda-malloc-backend", type=str_to_bool, - help="Disables API request streaming", + help="Runs with the pytorch CUDA malloc backend", + ) + developer_group.add_argument( + "--uvloop", + type=str_to_bool, + help="Run asyncio using Uvloop or Winloop", ) diff --git a/config_sample.yml b/config_sample.yml index c5d6c9c..3070642 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -62,6 +62,11 @@ developer: # This can save a few MBs of VRAM, but has a risk of errors. Use at your own risk. #cuda_malloc_backend: False + # Enable Uvloop or Winloop (default: False) + # Make the program utilize a faster async event loop which can improve performance + # NOTE: It's recommended to enable this, but if something breaks, turn this off. + #uvloop: False + # Options for model overrides and loading # Please read the comments to understand how arguments are handled between initial and API loads model: diff --git a/endpoints/server.py b/endpoints/server.py index 4fc3f0b..401b211 100644 --- a/endpoints/server.py +++ b/endpoints/server.py @@ -1,3 +1,4 @@ +import asyncio import uvicorn from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware @@ -71,11 +72,15 @@ async def start_api(host: str, port: int): # Setup app app = setup_app() + # Get the current event loop + loop = asyncio.get_running_loop() + config = uvicorn.Config( app, host=host, port=port, log_config=UVICORN_LOG_CONFIG, + loop=loop, ) server = uvicorn.Server(config) diff --git a/main.py b/main.py index fd29297..2ec98db 100644 --- a/main.py +++ b/main.py @@ -1,10 +1,10 @@ """The main tabbyAPI module. Contains the FastAPI server and endpoints.""" import asyncio -import aiofiles import json import os import pathlib +import platform import signal from loguru import logger from typing import Optional @@ -23,51 +23,8 @@ if not do_export_openapi: from backends.exllamav2.utils import check_exllama_version -async def entrypoint(args: Optional[dict] = None): - """Entry function for program startup""" - - setup_logger() - - # Set up signal aborting - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - if os.getenv("EXPORT_OPENAPI", "").lower() in ("true", "1"): - openapi_json = export_openapi() - - async with aiofiles.open("openapi.json", "w") as f: - await f.write(json.dumps(openapi_json)) - logger.info("Successfully wrote OpenAPI spec to openapi.json") - - return - - # Load from YAML config - config.from_file(pathlib.Path("config.yml")) - - # Parse and override config from args - if args is None: - parser = init_argparser() - args = convert_args_to_dict(parser.parse_args(), parser) - - config.from_args(args) - - developer_config = config.developer_config() - - # Check exllamav2 version and give a descriptive error if it's too old - # Skip if launching unsafely - - if unwrap(developer_config.get("unsafe_launch"), False): - logger.warning( - "UNSAFE: Skipping ExllamaV2 version check.\n" - "If you aren't a developer, please keep this off!" - ) - else: - check_exllama_version() - - # Enable CUDA malloc backend - if unwrap(developer_config.get("cuda_malloc_backend"), False): - os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync" - logger.warning("Enabled the experimental CUDA malloc backend.") +async def entrypoint_async(): + """Async entry function for program startup""" network_config = config.network_config() @@ -131,5 +88,64 @@ async def entrypoint(args: Optional[dict] = None): await start_api(host, port) +def entrypoint(arguments: Optional[dict] = None): + setup_logger() + + # Set up signal aborting + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + if do_export_openapi: + openapi_json = export_openapi() + + with open("openapi.json", "w") as f: + f.write(json.dumps(openapi_json)) + logger.info("Successfully wrote OpenAPI spec to openapi.json") + + return + + # Load from YAML config + config.from_file(pathlib.Path("config.yml")) + + # Parse and override config from args + if arguments is None: + parser = init_argparser() + arguments = convert_args_to_dict(parser.parse_args(), parser) + + config.from_args(arguments) + developer_config = config.developer_config() + + # Check exllamav2 version and give a descriptive error if it's too old + # Skip if launching unsafely + + if unwrap(developer_config.get("unsafe_launch"), False): + logger.warning( + "UNSAFE: Skipping ExllamaV2 version check.\n" + "If you aren't a developer, please keep this off!" + ) + else: + check_exllama_version() + + # Enable CUDA malloc backend + if unwrap(developer_config.get("cuda_malloc_backend"), False): + os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync" + logger.warning("EXPERIMENTAL: Enabled the pytorch CUDA malloc backend.") + + # Use Uvloop/Winloop + if unwrap(developer_config.get("uvloop"), False): + if platform.system() == "Windows": + from winloop import install + else: + from uvloop import install + + # Set loop event policy + install() + + logger.warning("EXPERIMENTAL: Running program with Uvloop/Winloop.") + + # Enter into the async event loop + asyncio.run(entrypoint_async()) + + if __name__ == "__main__": - asyncio.run(entrypoint()) + entrypoint() diff --git a/pyproject.toml b/pyproject.toml index ecf6362..031e228 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,10 @@ dependencies = [ "lm-format-enforcer >= 0.9.6", "aiofiles", + # Improved asyncio loops + "uvloop ; platform_system == 'Linux' and platform_machine == 'x86_64'", + "winloop ; platform_system == 'Windows'", + # TEMP: Remove once 2.x is fixed in upstream "numpy < 2.0.0", diff --git a/start.py b/start.py index d1f8843..ddf4d27 100644 --- a/start.py +++ b/start.py @@ -1,6 +1,5 @@ """Utility to automatically upgrade and start the API""" -import asyncio import argparse import os import pathlib @@ -159,4 +158,5 @@ if __name__ == "__main__": # Import entrypoint after installing all requirements from main import entrypoint - asyncio.run(entrypoint(convert_args_to_dict(args, parser))) + converted_args = convert_args_to_dict(args, parser) + entrypoint(converted_args)