From 5c082b7e8c4570ceff108191b53b5497f7c00cf5 Mon Sep 17 00:00:00 2001
From: kingbri <bdashore3@proton.me>
Date: Wed, 24 Jul 2024 18:56:28 -0400
Subject: [PATCH] Async: Add option to use Uvloop/Winloop

Uvloop and Winloop are faster event loops for asyncio, which should
improve overall performance. Gate them behind an experimental developer
flag for now so that the loops can be stress tested.

Signed-off-by: kingbri <bdashore3@proton.me>
---
 common/args.py      |   7 ++-
 config_sample.yml   |   5 ++
 endpoints/server.py |   5 ++
 main.py             | 110 +++++++++++++++++++++++++-------------------
 pyproject.toml      |   4 ++
 start.py            |   4 +-
 6 files changed, 85 insertions(+), 50 deletions(-)

diff --git a/common/args.py b/common/args.py
index bbf007c..e57de78 100644
--- a/common/args.py
+++ b/common/args.py
@@ -193,7 +193,12 @@ def add_developer_args(parser: argparse.ArgumentParser):
     developer_group.add_argument(
         "--cuda-malloc-backend",
         type=str_to_bool,
-        help="Disables API request streaming",
+        help="Runs with the pytorch CUDA malloc backend",
+    )
+    developer_group.add_argument(
+        "--uvloop",
+        type=str_to_bool,
+        help="Run asyncio using Uvloop or Winloop",
     )
 
 
diff --git a/config_sample.yml b/config_sample.yml
index c5d6c9c..3070642 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -62,6 +62,11 @@ developer:
   # This can save a few MBs of VRAM, but has a risk of errors. Use at your own risk.
   #cuda_malloc_backend: False
 
+  # Enable Uvloop or Winloop (default: False)
+  # Runs asyncio on a faster event loop (Winloop on Windows, Uvloop elsewhere), which can improve performance.
+  # NOTE: This is experimental. Enabling it is recommended, but turn it off if something breaks.
+  #uvloop: False
+
 # Options for model overrides and loading
 # Please read the comments to understand how arguments are handled between initial and API loads
 model:
diff --git a/endpoints/server.py b/endpoints/server.py
index 4fc3f0b..401b211 100644
--- a/endpoints/server.py
+++ b/endpoints/server.py
@@ -1,3 +1,4 @@
+import asyncio
 import uvicorn
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
@@ -71,11 +72,15 @@ async def start_api(host: str, port: int):
     # Setup app
     app = setup_app()
 
+    # Get the current event loop
+    loop = asyncio.get_running_loop()
+
     config = uvicorn.Config(
         app,
         host=host,
         port=port,
         log_config=UVICORN_LOG_CONFIG,
+        loop=loop,
     )
     server = uvicorn.Server(config)
 
diff --git a/main.py b/main.py
index fd29297..2ec98db 100644
--- a/main.py
+++ b/main.py
@@ -1,10 +1,10 @@
 """The main tabbyAPI module. Contains the FastAPI server and endpoints."""
 
 import asyncio
-import aiofiles
 import json
 import os
 import pathlib
+import platform
 import signal
 from loguru import logger
 from typing import Optional
@@ -23,51 +23,8 @@ if not do_export_openapi:
     from backends.exllamav2.utils import check_exllama_version
 
 
-async def entrypoint(args: Optional[dict] = None):
-    """Entry function for program startup"""
-
-    setup_logger()
-
-    # Set up signal aborting
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    if os.getenv("EXPORT_OPENAPI", "").lower() in ("true", "1"):
-        openapi_json = export_openapi()
-
-        async with aiofiles.open("openapi.json", "w") as f:
-            await f.write(json.dumps(openapi_json))
-            logger.info("Successfully wrote OpenAPI spec to openapi.json")
-
-        return
-
-    # Load from YAML config
-    config.from_file(pathlib.Path("config.yml"))
-
-    # Parse and override config from args
-    if args is None:
-        parser = init_argparser()
-        args = convert_args_to_dict(parser.parse_args(), parser)
-
-    config.from_args(args)
-
-    developer_config = config.developer_config()
-
-    # Check exllamav2 version and give a descriptive error if it's too old
-    # Skip if launching unsafely
-
-    if unwrap(developer_config.get("unsafe_launch"), False):
-        logger.warning(
-            "UNSAFE: Skipping ExllamaV2 version check.\n"
-            "If you aren't a developer, please keep this off!"
-        )
-    else:
-        check_exllama_version()
-
-    # Enable CUDA malloc backend
-    if unwrap(developer_config.get("cuda_malloc_backend"), False):
-        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
-        logger.warning("Enabled the experimental CUDA malloc backend.")
+async def entrypoint_async():
+    """Async entry function for program startup"""
 
     network_config = config.network_config()
 
@@ -131,5 +88,64 @@ async def entrypoint(args: Optional[dict] = None):
     await start_api(host, port)
 
 
+def entrypoint(arguments: Optional[dict] = None):
+    setup_logger()
+
+    # Set up signal aborting
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    if do_export_openapi:
+        openapi_json = export_openapi()
+
+        with open("openapi.json", "w") as f:
+            f.write(json.dumps(openapi_json))
+            logger.info("Successfully wrote OpenAPI spec to openapi.json")
+
+        return
+
+    # Load from YAML config
+    config.from_file(pathlib.Path("config.yml"))
+
+    # Parse and override config from args
+    if arguments is None:
+        parser = init_argparser()
+        arguments = convert_args_to_dict(parser.parse_args(), parser)
+
+    config.from_args(arguments)
+    developer_config = config.developer_config()
+
+    # Check exllamav2 version and give a descriptive error if it's too old
+    # Skip if launching unsafely
+
+    if unwrap(developer_config.get("unsafe_launch"), False):
+        logger.warning(
+            "UNSAFE: Skipping ExllamaV2 version check.\n"
+            "If you aren't a developer, please keep this off!"
+        )
+    else:
+        check_exllama_version()
+
+    # Enable CUDA malloc backend
+    if unwrap(developer_config.get("cuda_malloc_backend"), False):
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
+        logger.warning("EXPERIMENTAL: Enabled the pytorch CUDA malloc backend.")
+
+    # Use Uvloop/Winloop
+    if unwrap(developer_config.get("uvloop"), False):
+        if platform.system() == "Windows":
+            from winloop import install
+        else:
+            from uvloop import install
+
+        # Set loop event policy
+        install()
+
+        logger.warning("EXPERIMENTAL: Running program with Uvloop/Winloop.")
+
+    # Enter into the async event loop
+    asyncio.run(entrypoint_async())
+
+
 if __name__ == "__main__":
-    asyncio.run(entrypoint())
+    entrypoint()
diff --git a/pyproject.toml b/pyproject.toml
index ecf6362..031e228 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,6 +29,10 @@ dependencies = [
     "lm-format-enforcer >= 0.9.6",
     "aiofiles",
 
+    # Improved asyncio loops
+    "uvloop ; platform_system == 'Linux' and platform_machine == 'x86_64'",
+    "winloop ; platform_system == 'Windows'",
+
     # TEMP: Remove once 2.x is fixed in upstream
     "numpy < 2.0.0",
 
diff --git a/start.py b/start.py
index d1f8843..ddf4d27 100644
--- a/start.py
+++ b/start.py
@@ -1,6 +1,5 @@
 """Utility to automatically upgrade and start the API"""
 
-import asyncio
 import argparse
 import os
 import pathlib
@@ -159,4 +158,5 @@ if __name__ == "__main__":
     # Import entrypoint after installing all requirements
     from main import entrypoint
 
-    asyncio.run(entrypoint(convert_args_to_dict(args, parser)))
+    converted_args = convert_args_to_dict(args, parser)
+    entrypoint(converted_args)