API: Auto-unload on a load request

Automatically unload the existing model when calling /load. This was
requested many times, and does make more sense in the long run.

Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
kingbri 2024-02-21 23:00:11 -05:00
parent 368eb2e2d9
commit bee26a2f2c
2 changed files with 14 additions and 4 deletions

View file

@@ -464,6 +464,8 @@ class ExllamaV2Container:
gc.collect()
torch.cuda.empty_cache()
logger.info("Model unloaded.")
def encode_tokens(self, text: str, **kwargs):
"""Wrapper to encode tokens from a text string"""

16
main.py
View file

@@ -172,11 +172,19 @@ async def load_model(request: Request, data: ModelLoadRequest):
"""Loads a model into the model container."""
global MODEL_CONTAINER
if MODEL_CONTAINER and MODEL_CONTAINER.model:
raise HTTPException(400, "A model is already loaded! Please unload it first.")
if not data.name:
raise HTTPException(400, "model_name not found.")
raise HTTPException(400, "A model name was not provided.")
# Unload the existing model
if MODEL_CONTAINER and MODEL_CONTAINER.model:
loaded_model_name = MODEL_CONTAINER.get_model_path().name
if loaded_model_name == data.name:
raise HTTPException(
400, f"Model \"{loaded_model_name}\"is already loaded! Aborting."
)
else:
MODEL_CONTAINER.unload()
model_path = pathlib.Path(unwrap(get_model_config().get("model_dir"), "models"))
model_path = model_path / data.name