From b967e2e604b88da94402ec45fd4f520fd2a3a0f0 Mon Sep 17 00:00:00 2001
From: david
Date: Thu, 9 Nov 2023 21:27:45 -0600
Subject: [PATCH] Initial

---
 README.md        |  0
 llm.py           | 38 ++++++++++++++++++++++++++++++++++++++
 main.py          | 39 +++++++++++++++++++++++++++++++++++++++
 requirements.txt |  3 +++
 4 files changed, 80 insertions(+)
 create mode 100644 README.md
 create mode 100644 llm.py
 create mode 100644 main.py
 create mode 100644 requirements.txt

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/llm.py b/llm.py
new file mode 100644
index 0000000..b0a2336
--- /dev/null
+++ b/llm.py
@@ -0,0 +1,38 @@
+# llm.py
+import random
+import time
+from exllamav2 import (
+    ExLlamaV2,
+    ExLlamaV2Config,
+    ExLlamaV2Cache,
+    ExLlamaV2Tokenizer,
+)
+from exllamav2.generator import (
+    ExLlamaV2BaseGenerator,
+    ExLlamaV2Sampler,
+)
+class ModelManager:
+    def __init__(self, model_directory: str = None):
+        if model_directory is None:
+            model_directory = "/mnt/str/models/_exl2/mistral-7b-instruct-exl2/4.0bpw/"
+        self.config = ExLlamaV2Config()
+        self.config.model_dir = model_directory
+        self.config.prepare()
+        self.model = ExLlamaV2(self.config)
+        print("Loading model: " + model_directory)
+        self.cache = ExLlamaV2Cache(self.model, lazy=True)
+        self.model.load_autosplit(self.cache)
+        self.tokenizer = ExLlamaV2Tokenizer(self.config)
+        self.generator = ExLlamaV2BaseGenerator(self.model, self.cache, self.tokenizer)
+    def generate_text(self, prompt: str, max_new_tokens: int = 150, seed: int = None):
+        try:
+            self.generator.warmup()
+            time_begin = time.time()
+            output = self.generator.generate_simple(
+                prompt, ExLlamaV2Sampler.Settings(), max_new_tokens, seed=seed
+            )
+            time_end = time.time()
+            time_total = time_end - time_begin
+            return output, f"{time_total:.2f} seconds"
+        except Exception as e:
+            raise RuntimeError(f"Error generating text: {str(e)}")
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..1a36284
--- /dev/null
+++ b/main.py
@@ -0,0 +1,39 @@
+# main.py
+
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from llm import ModelManager
+from uvicorn import run
+
+app = FastAPI()
+
+# Example: using a different model directory than the default in llm.py
+modelManager = ModelManager("/home/david/Models/SynthIA-7B-v2.0-5.0bpw-h6-exl2")
+
+class TextRequest(BaseModel):
+    model: str
+    messages: list[dict]
+    temperature: float
+
+class TextResponse(BaseModel):
+    response: str
+    generation_time: str
+
+@app.post("/generate-text", response_model=TextResponse)
+def generate_text(request: TextRequest):
+    try:
+        # model_path = request.model  # could later be used to select a specific model
+        messages = request.messages
+        # temperature = request.temperature  # not yet passed to the generator
+
+        # Use the first user message in the conversation as the prompt
+        user_message = next(msg["content"] for msg in messages if msg["role"] == "user")
+
+        # Generate a completion and report how long it took
+        output, generation_time = modelManager.generate_text(user_message)
+        return {"response": output, "generation_time": generation_time}
+    except RuntimeError as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+if __name__ == "__main__":
+    run("main:app", host="0.0.0.0", port=8012, reload=True)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..56744dc
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+fastapi==0.104.1
+uvicorn==0.24.0
+pydantic==2.4.2
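
Not part of the patch above: a minimal client sketch for exercising the new endpoint once `python main.py` is running, assuming the server is reachable at localhost:8012 as configured in main.py. It uses only the standard library; the payload values are hypothetical, and the `model` and `temperature` fields are accepted by the request schema but not yet used by the handler.

import json
import urllib.request

# Hypothetical request payload; only the first "user" message is used as the prompt.
payload = {
    "model": "local",
    "messages": [{"role": "user", "content": "Write a haiku about GPUs."}],
    "temperature": 0.7,
}

request = urllib.request.Request(
    "http://localhost:8012/generate-text",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)

# POST the request and print the generated text and timing reported by the server.
with urllib.request.urlopen(request) as response:
    body = json.loads(response.read())
    print(body["response"])
    print(body["generation_time"])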