Async: Add option to use Uvloop/Winloop

Uvloop (on Unix-like systems) and Winloop (on Windows) are faster event
loop implementations for asyncio and should improve overall performance.
Gate them behind an experimental flag for now so the new loops can be
stress tested.

Signed-off-by: kingbri <bdashore3@proton.me>
kingbri 2024-07-24 18:56:28 -04:00
parent 71de3060bb
commit 5c082b7e8c
6 changed files with 85 additions and 50 deletions


@@ -193,7 +193,12 @@ def add_developer_args(parser: argparse.ArgumentParser):
     developer_group.add_argument(
         "--cuda-malloc-backend",
         type=str_to_bool,
-        help="Disables API request streaming",
+        help="Runs with the pytorch CUDA malloc backend",
     )
+    developer_group.add_argument(
+        "--uvloop",
+        type=str_to_bool,
+        help="Run asyncio using Uvloop or Winloop",
+    )
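Both flags are parsed with str_to_bool, so they take an explicit value on the command line (e.g. --uvloop True). A minimal sketch of such a converter, assuming the project's actual helper behaves along these lines:

def str_to_bool(value: str) -> bool:
    # Hypothetical converter: argparse calls this with the raw string.
    if value.lower() in ("true", "1", "yes"):
        return True
    if value.lower() in ("false", "0", "no"):
        return False
    # argparse turns a raised ValueError into a clean usage error
    raise ValueError(f"'{value}' is not a valid boolean value")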


@@ -62,6 +62,11 @@ developer:
   # This can save a few MBs of VRAM, but has a risk of errors. Use at your own risk.
   #cuda_malloc_backend: False
 
+  # Enable Uvloop or Winloop (default: False)
+  # Makes the program use a faster async event loop, which can improve performance
+  # NOTE: It's recommended to enable this, but if something breaks, turn it off.
+  #uvloop: False
+
 # Options for model overrides and loading
 # Please read the comments to understand how arguments are handled between initial and API loads
 model:
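The new key is consumed through the project's config helpers in the main.py hunk below (config.from_file, developer_config(), and unwrap() with a False default). A hedged, dependency-light mini-version of that load path:

import pathlib

import yaml

# Parse config.yml, read the developer block, and fall back to False when
# the key is absent or commented out (what unwrap(..., False) does there).
raw = yaml.safe_load(pathlib.Path("config.yml").read_text()) or {}
developer = raw.get("developer") or {}
use_uvloop = developer.get("uvloop", False)
print(use_uvloop)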


@@ -1,3 +1,4 @@
+import asyncio
 import uvicorn
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
@@ -71,11 +72,15 @@ async def start_api(host: str, port: int):
 
     # Setup app
     app = setup_app()
 
+    # Get the current event loop
+    loop = asyncio.get_running_loop()
+
     config = uvicorn.Config(
         app,
         host=host,
         port=port,
         log_config=UVICORN_LOG_CONFIG,
+        loop=loop,
     )
     server = uvicorn.Server(config)
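The hunk ends at the server construction; presumably start_api goes on to await the server on the same loop. When serve() is awaited directly like this, uvicorn does not set up a loop of its own: it runs on whatever loop is already current, so the actual loop implementation comes from the policy installed in main.py before asyncio.run().

    # Assumed continuation, not shown in the hunk: serve on the loop that
    # asyncio.run() created from the installed Uvloop/Winloop policy.
    await server.serve()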

main.py

@@ -1,10 +1,10 @@
 """The main tabbyAPI module. Contains the FastAPI server and endpoints."""
 
 import asyncio
-import aiofiles
 import json
 import os
 import pathlib
+import platform
 import signal
 from loguru import logger
 from typing import Optional
@@ -23,51 +23,8 @@ if not do_export_openapi:
     from backends.exllamav2.utils import check_exllama_version
 
-async def entrypoint(args: Optional[dict] = None):
-    """Entry function for program startup"""
-
-    setup_logger()
-
-    # Set up signal aborting
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    if os.getenv("EXPORT_OPENAPI", "").lower() in ("true", "1"):
-        openapi_json = export_openapi()
-
-        async with aiofiles.open("openapi.json", "w") as f:
-            await f.write(json.dumps(openapi_json))
-
-        logger.info("Successfully wrote OpenAPI spec to openapi.json")
-        return
-
-    # Load from YAML config
-    config.from_file(pathlib.Path("config.yml"))
-
-    # Parse and override config from args
-    if args is None:
-        parser = init_argparser()
-        args = convert_args_to_dict(parser.parse_args(), parser)
-
-    config.from_args(args)
-
-    developer_config = config.developer_config()
-
-    # Check exllamav2 version and give a descriptive error if it's too old
-    # Skip if launching unsafely
-    if unwrap(developer_config.get("unsafe_launch"), False):
-        logger.warning(
-            "UNSAFE: Skipping ExllamaV2 version check.\n"
-            "If you aren't a developer, please keep this off!"
-        )
-    else:
-        check_exllama_version()
-
-    # Enable CUDA malloc backend
-    if unwrap(developer_config.get("cuda_malloc_backend"), False):
-        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
-        logger.warning("Enabled the experimental CUDA malloc backend.")
-
+async def entrypoint_async():
+    """Async entry function for program startup"""
+
     network_config = config.network_config()
@@ -131,5 +88,64 @@ async def entrypoint(args: Optional[dict] = None):
     await start_api(host, port)
 
 
+def entrypoint(arguments: Optional[dict] = None):
+    setup_logger()
+
+    # Set up signal aborting
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    if do_export_openapi:
+        openapi_json = export_openapi()
+
+        with open("openapi.json", "w") as f:
+            f.write(json.dumps(openapi_json))
+
+        logger.info("Successfully wrote OpenAPI spec to openapi.json")
+        return
+
+    # Load from YAML config
+    config.from_file(pathlib.Path("config.yml"))
+
+    # Parse and override config from args
+    if arguments is None:
+        parser = init_argparser()
+        arguments = convert_args_to_dict(parser.parse_args(), parser)
+
+    config.from_args(arguments)
+
+    developer_config = config.developer_config()
+
+    # Check exllamav2 version and give a descriptive error if it's too old
+    # Skip if launching unsafely
+    if unwrap(developer_config.get("unsafe_launch"), False):
+        logger.warning(
+            "UNSAFE: Skipping ExllamaV2 version check.\n"
+            "If you aren't a developer, please keep this off!"
+        )
+    else:
+        check_exllama_version()
+
+    # Enable CUDA malloc backend
+    if unwrap(developer_config.get("cuda_malloc_backend"), False):
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
+        logger.warning("EXPERIMENTAL: Enabled the pytorch CUDA malloc backend.")
+
+    # Use Uvloop/Winloop
+    if unwrap(developer_config.get("uvloop"), False):
+        if platform.system() == "Windows":
+            from winloop import install
+        else:
+            from uvloop import install
+
+        # Set the event loop policy
+        install()
+
+        logger.warning("EXPERIMENTAL: Running program with Uvloop/Winloop.")
+
+    # Enter into the async event loop
+    asyncio.run(entrypoint_async())
+
 
 if __name__ == "__main__":
-    asyncio.run(entrypoint())
+    entrypoint()
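The ordering in the new synchronous entrypoint is the whole point of the restructure: install() swaps the global event loop policy, and the policy only affects loops created afterwards, so it has to run before asyncio.run() builds the loop for entrypoint_async(). A standalone sketch (using uvloop directly) that verifies the effect:

import asyncio

import uvloop

uvloop.install()  # sets uvloop.EventLoopPolicy() as the global policy

async def main():
    # The loop created by asyncio.run() below comes from the installed
    # policy, so this prints uvloop's loop class instead of the stdlib one.
    print(type(asyncio.get_running_loop()))

asyncio.run(main())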


@@ -29,6 +29,10 @@ dependencies = [
     "lm-format-enforcer >= 0.9.6",
     "aiofiles",
 
+    # Improved asyncio loops
+    "uvloop ; platform_system == 'Linux' and platform_machine == 'x86_64'",
+    "winloop ; platform_system == 'Windows'",
+
     # TEMP: Remove once 2.x is fixed in upstream
     "numpy < 2.0.0",


@@ -1,6 +1,5 @@
 """Utility to automatically upgrade and start the API"""
 
-import asyncio
 import argparse
 import os
 import pathlib
@@ -159,4 +158,5 @@ if __name__ == "__main__":
 
     # Import entrypoint after installing all requirements
     from main import entrypoint
 
-    asyncio.run(entrypoint(convert_args_to_dict(args, parser)))
+    converted_args = convert_args_to_dict(args, parser)
+    entrypoint(converted_args)
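Since entrypoint() is now synchronous and owns both the policy installation and asyncio.run(), callers like this launcher simply invoke it with an args dict. A hedged usage sketch (the dict shape is an assumption; whatever convert_args_to_dict produces is what config.from_args() expects):

from main import entrypoint

# Hypothetical programmatic launch; {"uvloop": True} assumes the args dict
# uses the same keys as the CLI flags and config options.
entrypoint({"uvloop": True})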