Config: Switch to YAML and add load progress
YAML is a more flexible format for configuration. Command-line arguments are difficult to remember and configure, especially for an API with complicated option names. Rather than using half-baked text files, implement a proper config solution. Also add a progress bar when loading models from the command line. Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
parent
5d32aa02cd
commit
a10c14d357
6 changed files with 38 additions and 26 deletions
5
.gitignore
vendored
5
.gitignore
vendored
|
|
@ -173,4 +173,7 @@ poetry.toml
|
|||
# LSP config files
|
||||
pyrightconfig.json
|
||||
|
||||
# End of https://www.toptal.com/developers/gitignore/api/python
|
||||
# End of https://www.toptal.com/developers/gitignore/api/python
|
||||
|
||||
# User configuration
|
||||
config.yml
|
||||
|
|
|
|||
8
config_sample.yml
Normal file
8
config_sample.yml
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
model_dir: "D:/models"
|
||||
model_name: "this_is_a_exl2_model"
|
||||
max_seq_len: 4096
|
||||
gpu_split: "auto"
|
||||
rope_scale: 1.0
|
||||
rope_alpha: 1.0
|
||||
no_flash_attn: False
|
||||
low_mem: False
|
||||
39
main.py
39
main.py
|
|
@ -1,10 +1,9 @@
|
|||
import os
|
||||
import argparse
|
||||
import uvicorn
|
||||
import yaml
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from model import ModelContainer
|
||||
from utils import add_args
|
||||
from progress.bar import IncrementalBar
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
|
|
@ -38,22 +37,30 @@ def generate_text(request: TextRequest):
|
|||
except RuntimeError as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
# Debug progress check
|
||||
def progress(module, modules):
    """Debug load callback: print how many modules have loaded, then pause.

    Yields once (None) so the model loader can drive it as a generator.
    """
    status = f"Loaded {module}/{modules} modules"
    print(status)
    yield
|
||||
# Wrapper callback for load progress
|
||||
def load_progress(module, modules):
    """Wrapper callback that surfaces load progress as a (module, modules) pair."""
    progress_pair = (module, modules)
    yield progress_pair
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Convert this parser to use a YAML config
|
||||
parser = argparse.ArgumentParser(description = "TabbyAPI - An API server for exllamav2")
|
||||
add_args(parser)
|
||||
args = parser.parse_args()
|
||||
# Load from YAML config. Possibly add a config -> kwargs conversion function
|
||||
with open('config.yml', 'r') as config_file:
|
||||
config = yaml.safe_load(config_file)
|
||||
|
||||
# If an initial model name is specified, create a container and load the model
|
||||
if config["model_name"]:
|
||||
model_path = f"{config['model_dir']}/{config['model_name']}" if config['model_dir'] else f"models/{config['model_name']}"
|
||||
|
||||
model_container = ModelContainer(model_path, False, **config)
|
||||
load_status = model_container.load_gen(load_progress)
|
||||
for (module, modules) in load_status:
|
||||
if module == 0:
|
||||
loading_bar: IncrementalBar = IncrementalBar("Modules", max = modules)
|
||||
else:
|
||||
loading_bar.next()
|
||||
|
||||
if module == modules:
|
||||
loading_bar.finish()
|
||||
|
||||
# If an initial model dir is specified, create a container and load the model
|
||||
if args.model_dir:
|
||||
model_container = ModelContainer(args.model_dir, False, **vars(args))
|
||||
print("Loading an initial model...")
|
||||
model_container.load(progress)
|
||||
print("Model successfully loaded.")
|
||||
|
||||
# Reload is for dev purposes ONLY!
|
||||
|
|
|
|||
4
model.py
4
model.py
|
|
@ -34,7 +34,6 @@ class ModelContainer:
|
|||
gpu_split: list or None = None
|
||||
|
||||
def __init__(self, model_directory: str, quiet = False, **kwargs):
|
||||
print(kwargs)
|
||||
"""
|
||||
Create model container
|
||||
|
||||
|
|
@ -76,6 +75,9 @@ class ModelContainer:
|
|||
if "rope_alpha" in kwargs: self.config.scale_alpha_value = kwargs["rope_alpha"]
|
||||
if "no_flash_attn" in kwargs: self.config.no_flash_attn = kwargs["no_flash_attn"]
|
||||
|
||||
if "low_mem" in kwargs and kwargs["low_mem"]:
|
||||
self.config.set_low_mem()
|
||||
|
||||
chunk_size = min(kwargs.get("chunk_size", 2048), self.config.max_seq_len)
|
||||
self.config.max_input_len = chunk_size
|
||||
self.config.max_attn_size = chunk_size ** 2
|
||||
|
|
|
|||
BIN
requirements.txt
BIN
requirements.txt
Binary file not shown.
8
utils.py
8
utils.py
|
|
@ -1,8 +0,0 @@
|
|||
def add_args(parser):
    """Register TabbyAPI's command-line options on the given argparse parser.

    Mutates *parser* in place; returns None.
    """
    # (short flag, long flag, add_argument kwargs) for every supported option.
    option_specs = (
        ("-m", "--model_dir", {"type": str, "help": "Path to model directory"}),
        ("-gs", "--gpu_split", {"type": str, "help": "\"auto\", or VRAM allocation per GPU in GB"}),
        ("-l", "--max_seq_len", {"type": int, "help": "Maximum sequence length"}),
        ("-rs", "--rope_scale", {"type": float, "default": 1.0, "help": "RoPE scaling factor"}),
        ("-ra", "--rope_alpha", {"type": float, "default": 1.0, "help": "RoPE alpha value (NTK)"}),
        ("-nfa", "--no_flash_attn", {"action": "store_true", "help": "Disable Flash Attention"}),
        ("-lm", "--low_mem", {"action": "store_true", "help": "Enable VRAM optimizations, potentially trading off speed"}),
    )
    for short_flag, long_flag, kwargs in option_specs:
        parser.add_argument(short_flag, long_flag, **kwargs)
||||
Loading…
Add table
Add a link
Reference in a new issue