Async: Add option to use Uvloop/Winloop

Uvloop (on Unix-like systems) and Winloop (on Windows) are faster event
loop implementations for asyncio and should improve overall performance.
Gate them behind an experimental flag for now so the new loops can be
stress tested.

Signed-off-by: kingbri <bdashore3@proton.me>
kingbri 2024-07-24 18:56:28 -04:00
parent 71de3060bb
commit 5c082b7e8c
6 changed files with 85 additions and 50 deletions


@@ -193,7 +193,12 @@ def add_developer_args(parser: argparse.ArgumentParser):
     developer_group.add_argument(
         "--cuda-malloc-backend",
         type=str_to_bool,
-        help="Disables API request streaming",
+        help="Runs with the pytorch CUDA malloc backend",
     )
+    developer_group.add_argument(
+        "--uvloop",
+        type=str_to_bool,
+        help="Run asyncio using Uvloop or Winloop",
+    )
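Both flags are parsed with str_to_bool, so they take an explicit value on the command line (e.g. --uvloop True). A minimal sketch of such a converter, assuming the project's actual helper behaves along these lines:

def str_to_bool(value: str) -> bool:
    # Hypothetical converter: argparse calls this with the raw string.
    if value.lower() in ("true", "1", "yes"):
        return True
    if value.lower() in ("false", "0", "no"):
        return False
    # argparse turns a raised ValueError into a clean usage error
    raise ValueError(f"'{value}' is not a valid boolean value")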


@@ -62,6 +62,11 @@ developer:
   # This can save a few MBs of VRAM, but has a risk of errors. Use at your own risk.
   #cuda_malloc_backend: False
 
+  # Enable Uvloop or Winloop (default: False)
+  # Makes the program use a faster async event loop, which can improve performance
+  # NOTE: It's recommended to enable this, but if something breaks, turn it off.
+  #uvloop: False
+
 # Options for model overrides and loading
 # Please read the comments to understand how arguments are handled between initial and API loads
 model:
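The new key is consumed through the project's config helpers in the main.py hunk below (config.from_file, developer_config(), and unwrap() with a False default). A hedged, dependency-light mini-version of that load path:

import pathlib

import yaml

# Parse config.yml, read the developer block, and fall back to False when
# the key is absent or commented out (what unwrap(..., False) does there).
raw = yaml.safe_load(pathlib.Path("config.yml").read_text()) or {}
developer = raw.get("developer") or {}
use_uvloop = developer.get("uvloop", False)
print(use_uvloop)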


@@ -1,3 +1,4 @@
+import asyncio
 import uvicorn
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
@@ -71,11 +72,15 @@ async def start_api(host: str, port: int):
 
     # Setup app
     app = setup_app()
 
+    # Get the current event loop
+    loop = asyncio.get_running_loop()
+
     config = uvicorn.Config(
         app,
         host=host,
         port=port,
         log_config=UVICORN_LOG_CONFIG,
+        loop=loop,
     )
     server = uvicorn.Server(config)
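The hunk ends at the server construction; presumably start_api goes on to await the server on the same loop. When serve() is awaited directly like this, uvicorn does not set up a loop of its own: it runs on whatever loop is already current, so the actual loop implementation comes from the policy installed in main.py before asyncio.run().

    # Assumed continuation, not shown in the hunk: serve on the loop that
    # asyncio.run() created from the installed Uvloop/Winloop policy.
    await server.serve()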

main.py

@@ -1,10 +1,10 @@
 """The main tabbyAPI module. Contains the FastAPI server and endpoints."""
 
 import asyncio
-import aiofiles
 import json
 import os
 import pathlib
+import platform
 import signal
 from loguru import logger
 from typing import Optional
@@ -23,51 +23,8 @@ if not do_export_openapi:
     from backends.exllamav2.utils import check_exllama_version
 
-async def entrypoint(args: Optional[dict] = None):
-    """Entry function for program startup"""
-
-    setup_logger()
-
-    # Set up signal aborting
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    if os.getenv("EXPORT_OPENAPI", "").lower() in ("true", "1"):
-        openapi_json = export_openapi()
-
-        async with aiofiles.open("openapi.json", "w") as f:
-            await f.write(json.dumps(openapi_json))
-
-        logger.info("Successfully wrote OpenAPI spec to openapi.json")
-        return
-
-    # Load from YAML config
-    config.from_file(pathlib.Path("config.yml"))
-
-    # Parse and override config from args
-    if args is None:
-        parser = init_argparser()
-        args = convert_args_to_dict(parser.parse_args(), parser)
-
-    config.from_args(args)
-
-    developer_config = config.developer_config()
-
-    # Check exllamav2 version and give a descriptive error if it's too old
-    # Skip if launching unsafely
-    if unwrap(developer_config.get("unsafe_launch"), False):
-        logger.warning(
-            "UNSAFE: Skipping ExllamaV2 version check.\n"
-            "If you aren't a developer, please keep this off!"
-        )
-    else:
-        check_exllama_version()
-
-    # Enable CUDA malloc backend
-    if unwrap(developer_config.get("cuda_malloc_backend"), False):
-        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
-        logger.warning("Enabled the experimental CUDA malloc backend.")
-
+async def entrypoint_async():
+    """Async entry function for program startup"""
+
     network_config = config.network_config()
@@ -131,5 +88,64 @@ async def entrypoint(args: Optional[dict] = None):
     await start_api(host, port)
 
 
+def entrypoint(arguments: Optional[dict] = None):
+    setup_logger()
+
+    # Set up signal aborting
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    if do_export_openapi:
+        openapi_json = export_openapi()
+
+        with open("openapi.json", "w") as f:
+            f.write(json.dumps(openapi_json))
+
+        logger.info("Successfully wrote OpenAPI spec to openapi.json")
+        return
+
+    # Load from YAML config
+    config.from_file(pathlib.Path("config.yml"))
+
+    # Parse and override config from args
+    if arguments is None:
+        parser = init_argparser()
+        arguments = convert_args_to_dict(parser.parse_args(), parser)
+
+    config.from_args(arguments)
+
+    developer_config = config.developer_config()
+
+    # Check exllamav2 version and give a descriptive error if it's too old
+    # Skip if launching unsafely
+    if unwrap(developer_config.get("unsafe_launch"), False):
+        logger.warning(
+            "UNSAFE: Skipping ExllamaV2 version check.\n"
+            "If you aren't a developer, please keep this off!"
+        )
+    else:
+        check_exllama_version()
+
+    # Enable CUDA malloc backend
+    if unwrap(developer_config.get("cuda_malloc_backend"), False):
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
+        logger.warning("EXPERIMENTAL: Enabled the pytorch CUDA malloc backend.")
+
+    # Use Uvloop/Winloop
+    if unwrap(developer_config.get("uvloop"), False):
+        if platform.system() == "Windows":
+            from winloop import install
+        else:
+            from uvloop import install
+
+        # Set the event loop policy
+        install()
+
+        logger.warning("EXPERIMENTAL: Running program with Uvloop/Winloop.")
+
+    # Enter into the async event loop
+    asyncio.run(entrypoint_async())
+
 
 if __name__ == "__main__":
-    asyncio.run(entrypoint())
+    entrypoint()
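The ordering in the new synchronous entrypoint is the whole point of the restructure: install() swaps the global event loop policy, and the policy only affects loops created afterwards, so it has to run before asyncio.run() builds the loop for entrypoint_async(). A standalone sketch (using uvloop directly) that verifies the effect:

import asyncio

import uvloop

uvloop.install()  # sets uvloop.EventLoopPolicy() as the global policy

async def main():
    # The loop created by asyncio.run() below comes from the installed
    # policy, so this prints uvloop's loop class instead of the stdlib one.
    print(type(asyncio.get_running_loop()))

asyncio.run(main())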


@@ -29,6 +29,10 @@ dependencies = [
     "lm-format-enforcer >= 0.9.6",
     "aiofiles",
 
+    # Improved asyncio loops
+    "uvloop ; platform_system == 'Linux' and platform_machine == 'x86_64'",
+    "winloop ; platform_system == 'Windows'",
+
     # TEMP: Remove once 2.x is fixed in upstream
     "numpy < 2.0.0",


@@ -1,6 +1,5 @@
 """Utility to automatically upgrade and start the API"""
 
-import asyncio
 import argparse
 import os
 import pathlib
@@ -159,4 +158,5 @@ if __name__ == "__main__":
 
     # Import entrypoint after installing all requirements
     from main import entrypoint
 
-    asyncio.run(entrypoint(convert_args_to_dict(args, parser)))
+    converted_args = convert_args_to_dict(args, parser)
+    entrypoint(converted_args)
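Since entrypoint() is now synchronous and owns both the policy installation and asyncio.run(), callers like this launcher simply invoke it with an args dict. A hedged usage sketch (the dict shape is an assumption; whatever convert_args_to_dict produces is what config.from_args() expects):

from main import entrypoint

# Hypothetical programmatic launch; {"uvloop": True} assumes the args dict
# uses the same keys as the CLI flags and config options.
entrypoint({"uvloop": True})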