tabbyAPI-ollama/endpoints/OAI/utils/embeddings.py
kingbri 3f21d9ef96 Embeddings: Switch to Infinity
Infinity-emb is an async batching engine for embeddings. This is
preferable to sentence-transformers since it handles scalable usecases
without the need for external thread intervention.

Signed-off-by: kingbri <bdashore3@proton.me>
2024-07-29 13:42:03 -04:00

117 lines
3.5 KiB
Python

"""
This file is derived from
[text-generation-webui openai extension embeddings](https://github.com/oobabooga/text-generation-webui/blob/1a7c027386f43b84f3ca3b0ff04ca48d861c2d7a/extensions/openai/embeddings.py)
and modified.
The changes introduced are: Suppression of progress bar,
typing/pydantic classes moved into this file,
embeddings function declared async.
"""
import asyncio
import os
import base64
import pathlib
from loguru import logger
import numpy as np
from transformers import AutoModel
from common import config
from common.utils import unwrap
from endpoints.OAI.types.embedding import (
EmbeddingObject,
EmbeddingsRequest,
EmbeddingsResponse,
)
embeddings_model = None
def load_embedding_model(model_path: pathlib.Path, device: str):
    """Construct the Infinity embeddings engine into the module global.

    Args:
        model_path: Filesystem path of the embeddings model directory.
        device: Torch device string (e.g. "cpu", "cuda"). May be None
            when the caller requested automatic device selection.

    Raises:
        ModuleNotFoundError: If infinity-emb is not installed.
        Exception: Re-raised from engine construction on any failure.
    """
    try:
        from infinity_emb import EngineArgs, AsyncEmbeddingEngine
    except ModuleNotFoundError:
        logger.error(
            "Skipping embeddings because infinity-emb is not installed.\n"
            "Please run the following command in your environment "
            "to install extra packages:\n"
            "pip install -U .[extras]"
        )
        raise ModuleNotFoundError from None

    global embeddings_model
    try:
        engine_args = EngineArgs(
            model_name_or_path=str(model_path.resolve()),
            engine="torch",
            # BUG FIX: device was hard-coded to "cpu", silently ignoring
            # the caller's requested device. Fall back to "auto" when the
            # caller passed None (the "auto" config setting).
            device=unwrap(device, "auto"),
            bettertransformer=False,
            model_warmup=False,
        )

        # Log before construction so the attempt is visible even if it fails
        logger.info(f"Trying to load embeddings model: {model_path.name} on {device}")
        embeddings_model = AsyncEmbeddingEngine.from_args(engine_args)
    except Exception:
        # Reset the global so later requests don't see a half-built engine;
        # bare raise preserves the original traceback
        embeddings_model = None
        raise
async def embeddings(data: EmbeddingsRequest) -> dict:
    """Generate embeddings for an OpenAI-style embeddings request.

    Loads the configured embeddings model, runs the batched embed call,
    and returns a dict shaped like the OAI embeddings response. When the
    request asks for base64 encoding, each vector is returned as a base64
    string of its raw bytes instead of a list of floats.

    Args:
        data: The parsed OAI embeddings request.

    Returns:
        An OAI-compatible embeddings response dict.
    """
    embeddings_config = config.embeddings_config()

    # Use CPU by default; "auto" defers device selection to the engine
    device = embeddings_config.get("embeddings_device", "cpu")
    if device == "auto":
        device = None

    model_path = pathlib.Path(embeddings_config.get("embeddings_model_dir"))
    model_path: pathlib.Path = model_path / embeddings_config.get(
        "embeddings_model_name"
    )

    # BUG FIX: a pathlib.Path object is always truthy, so the original
    # `if not model_path` check could never fire. Check the filesystem
    # instead. Loading still proceeds so the engine raises its own error.
    if not model_path.exists():
        logger.info("Embeddings model path not found")

    load_embedding_model(model_path, device)

    # Entering the async context starts the batching engine for this call
    async with embeddings_model:
        # Renamed from `embeddings` to avoid shadowing this coroutine's name
        embedding_vectors, usage = await embeddings_model.embed(data.input)

    # OAI expects a return of base64 if the input encoding_format is base64
    if data.encoding_format == "base64":
        embedding_data = [
            {
                "object": "embedding",
                "embedding": float_list_to_base64(emb),
                "index": n,
            }
            for n, emb in enumerate(embedding_vectors)
        ]
    else:
        embedding_data = [
            {"object": "embedding", "embedding": emb.tolist(), "index": n}
            for n, emb in enumerate(embedding_vectors)
        ]

    response = {
        "object": "list",
        "data": embedding_data,
        "model": model_path.name,
        "usage": {
            # Infinity reports a single token count; OAI wants both fields
            "prompt_tokens": usage,
            "total_tokens": usage,
        },
    }

    return response
def float_list_to_base64(float_array: np.ndarray) -> str:
    """Serialize a numpy array's raw bytes to an ASCII base64 string.

    OpenAI clients that request encoding_format="base64" expect the
    raw bytes of the embedding vector, base64-encoded. The caller is
    responsible for the array's dtype (typically float32).
    """
    raw_bytes = float_array.tobytes()
    return base64.b64encode(raw_bytes).decode("ascii")