Async: Add option to use Uvloop/Winloop
These are faster event loops for asyncio, which should improve overall performance. Gate them behind an experimental flag for now so the loops can be stress-tested. Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
parent
71de3060bb
commit
5c082b7e8c
6 changed files with 85 additions and 50 deletions
|
|
@ -193,7 +193,12 @@ def add_developer_args(parser: argparse.ArgumentParser):
|
|||
developer_group.add_argument(
|
||||
"--cuda-malloc-backend",
|
||||
type=str_to_bool,
|
||||
help="Disables API request streaming",
|
||||
help="Runs with the pytorch CUDA malloc backend",
|
||||
)
|
||||
developer_group.add_argument(
|
||||
"--uvloop",
|
||||
type=str_to_bool,
|
||||
help="Run asyncio using Uvloop or Winloop",
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -62,6 +62,11 @@ developer:
|
|||
# This can save a few MBs of VRAM, but has a risk of errors. Use at your own risk.
|
||||
#cuda_malloc_backend: False
|
||||
|
||||
# Enable Uvloop or Winloop (default: False)
|
||||
# Makes the program use a faster asyncio event loop (Winloop on Windows, Uvloop elsewhere), which can improve performance
|
||||
# NOTE: It's recommended to enable this, but if something breaks, turn this off.
|
||||
#uvloop: False
|
||||
|
||||
# Options for model overrides and loading
|
||||
# Please read the comments to understand how arguments are handled between initial and API loads
|
||||
model:
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
import asyncio
|
||||
import uvicorn
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
|
@ -71,11 +72,15 @@ async def start_api(host: str, port: int):
|
|||
# Setup app
|
||||
app = setup_app()
|
||||
|
||||
# Get the current event loop
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
config = uvicorn.Config(
|
||||
app,
|
||||
host=host,
|
||||
port=port,
|
||||
log_config=UVICORN_LOG_CONFIG,
|
||||
loop=loop,
|
||||
)
|
||||
server = uvicorn.Server(config)
|
||||
|
||||
|
|
|
|||
110
main.py
110
main.py
|
|
@ -1,10 +1,10 @@
|
|||
"""The main tabbyAPI module. Contains the FastAPI server and endpoints."""
|
||||
|
||||
import asyncio
|
||||
import aiofiles
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
import platform
|
||||
import signal
|
||||
from loguru import logger
|
||||
from typing import Optional
|
||||
|
|
@ -23,51 +23,8 @@ if not do_export_openapi:
|
|||
from backends.exllamav2.utils import check_exllama_version
|
||||
|
||||
|
||||
async def entrypoint(args: Optional[dict] = None):
|
||||
"""Entry function for program startup"""
|
||||
|
||||
setup_logger()
|
||||
|
||||
# Set up signal aborting
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
|
||||
if os.getenv("EXPORT_OPENAPI", "").lower() in ("true", "1"):
|
||||
openapi_json = export_openapi()
|
||||
|
||||
async with aiofiles.open("openapi.json", "w") as f:
|
||||
await f.write(json.dumps(openapi_json))
|
||||
logger.info("Successfully wrote OpenAPI spec to openapi.json")
|
||||
|
||||
return
|
||||
|
||||
# Load from YAML config
|
||||
config.from_file(pathlib.Path("config.yml"))
|
||||
|
||||
# Parse and override config from args
|
||||
if args is None:
|
||||
parser = init_argparser()
|
||||
args = convert_args_to_dict(parser.parse_args(), parser)
|
||||
|
||||
config.from_args(args)
|
||||
|
||||
developer_config = config.developer_config()
|
||||
|
||||
# Check exllamav2 version and give a descriptive error if it's too old
|
||||
# Skip if launching unsafely
|
||||
|
||||
if unwrap(developer_config.get("unsafe_launch"), False):
|
||||
logger.warning(
|
||||
"UNSAFE: Skipping ExllamaV2 version check.\n"
|
||||
"If you aren't a developer, please keep this off!"
|
||||
)
|
||||
else:
|
||||
check_exllama_version()
|
||||
|
||||
# Enable CUDA malloc backend
|
||||
if unwrap(developer_config.get("cuda_malloc_backend"), False):
|
||||
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
|
||||
logger.warning("Enabled the experimental CUDA malloc backend.")
|
||||
async def entrypoint_async():
|
||||
"""Async entry function for program startup"""
|
||||
|
||||
network_config = config.network_config()
|
||||
|
||||
|
|
@ -131,5 +88,64 @@ async def entrypoint(args: Optional[dict] = None):
|
|||
await start_api(host, port)
|
||||
|
||||
|
||||
def entrypoint(arguments: Optional[dict] = None):
    """Entry function for program startup.

    Sets up logging and signal handlers, optionally exports the OpenAPI spec,
    loads configuration from config.yml and the provided arguments, applies
    the developer options, and finally runs the async entrypoint.

    Args:
        arguments: Argument overrides as a dict. When None, the command line
            is parsed with the parser returned by init_argparser().
    """

    setup_logger()

    # Set up signal aborting
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    # Export mode: write the spec and exit without loading config or models
    if do_export_openapi:
        openapi_json = export_openapi()

        with open("openapi.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(openapi_json))
            logger.info("Successfully wrote OpenAPI spec to openapi.json")

        return

    # Load from YAML config
    config.from_file(pathlib.Path("config.yml"))

    # Parse and override config from args
    if arguments is None:
        parser = init_argparser()
        arguments = convert_args_to_dict(parser.parse_args(), parser)

    config.from_args(arguments)
    developer_config = config.developer_config()

    # Check exllamav2 version and give a descriptive error if it's too old
    # Skip if launching unsafely

    if unwrap(developer_config.get("unsafe_launch"), False):
        logger.warning(
            "UNSAFE: Skipping ExllamaV2 version check.\n"
            "If you aren't a developer, please keep this off!"
        )
    else:
        check_exllama_version()

    # Enable CUDA malloc backend
    if unwrap(developer_config.get("cuda_malloc_backend"), False):
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
        logger.warning("EXPERIMENTAL: Enabled the pytorch CUDA malloc backend.")

    # Use Uvloop/Winloop
    if unwrap(developer_config.get("uvloop"), False):
        # These packages are optional, platform-specific dependencies, so the
        # import can fail on platforms where neither one is installed.
        # Fall back to the default asyncio loop instead of crashing at startup.
        try:
            if platform.system() == "Windows":
                from winloop import install
            else:
                from uvloop import install
        except ImportError:
            logger.warning(
                "Uvloop/Winloop is not installed on this platform. "
                "Falling back to the default asyncio event loop."
            )
        else:
            # Set loop event policy
            install()

            logger.warning("EXPERIMENTAL: Running program with Uvloop/Winloop.")

    # Enter into the async event loop
    asyncio.run(entrypoint_async())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(entrypoint())
|
||||
entrypoint()
|
||||
|
|
|
|||
|
|
@ -29,6 +29,10 @@ dependencies = [
|
|||
"lm-format-enforcer >= 0.9.6",
|
||||
"aiofiles",
|
||||
|
||||
# Improved asyncio loops
|
||||
"uvloop ; platform_system == 'Linux' and platform_machine == 'x86_64'",
|
||||
"winloop ; platform_system == 'Windows'",
|
||||
|
||||
# TEMP: Remove once 2.x is fixed in upstream
|
||||
"numpy < 2.0.0",
|
||||
|
||||
|
|
|
|||
4
start.py
4
start.py
|
|
@ -1,6 +1,5 @@
|
|||
"""Utility to automatically upgrade and start the API"""
|
||||
|
||||
import asyncio
|
||||
import argparse
|
||||
import os
|
||||
import pathlib
|
||||
|
|
@ -159,4 +158,5 @@ if __name__ == "__main__":
|
|||
# Import entrypoint after installing all requirements
|
||||
from main import entrypoint
|
||||
|
||||
asyncio.run(entrypoint(convert_args_to_dict(args, parser)))
|
||||
converted_args = convert_args_to_dict(args, parser)
|
||||
entrypoint(converted_args)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue