Model: Have load take the highest priority

The admin takes priority over the regular user. Therefore, if a model is loading, ignore all incoming generation requests.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
2025-04-18 22:08:48 -04:00 · 2025-04-18 22:08:48 -04:00 · 552a64c723
commit 552a64c723
parent 3f1d5d396e
1 changed file with 20 additions and 10 deletions
--- a/common/model.py
+++ b/common/model.py
@@ -65,22 +65,27 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
        logger.info("Unloading existing model.")
        await unload_model()

+    # Reset to prepare for a new container
+    container = None
+
    # Merge with config defaults
    kwargs = {**config.model_defaults, **kwargs}

    # Create a new container
-    container = await ExllamaV2Container.create(model_path.resolve(), False, **kwargs)
+    new_container = await ExllamaV2Container.create(
+        model_path.resolve(), False, **kwargs
+    )

    # Add possible types of models that can be loaded
    model_type = [ModelType.MODEL]

-    if container.use_vision:
+    if new_container.use_vision:
        model_type.insert(0, ModelType.VISION)

-    if container.draft_config:
+    if new_container.draft_config:
        model_type.insert(0, ModelType.DRAFT)

-    load_status = container.load_gen(load_progress, **kwargs)
+    load_status = new_container.load_gen(load_progress, **kwargs)

    progress = get_loading_progress_bar()
    progress.start()
@@ -104,6 +109,8 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
                    progress.stop()
                else:
                    index += 1
+
+        container = new_container
    finally:
        progress.stop()

@@ -150,8 +157,13 @@ async def load_embedding_model(model_path: pathlib.Path, **kwargs):
        logger.info("Unloading existing embeddings model.")
        await unload_embedding_model()

-    embeddings_container = InfinityContainer(model_path)
-    await embeddings_container.load(**kwargs)
+    # Reset to prepare for a new container
+    embeddings_container = None
+
+    new_embeddings_container = InfinityContainer(model_path)
+    await new_embeddings_container.load(**kwargs)
+
+    embeddings_container = new_embeddings_container


 async def unload_embedding_model():
@@ -164,7 +176,7 @@ async def unload_embedding_model():
 async def check_model_container():
    """FastAPI depends that checks if a model isn't loaded or currently loading."""

-    if container is None or not (container.model_is_loading or container.model_loaded):
+    if container is None:
        error_message = handle_request_error(
            "No models are currently loaded.",
            exc_info=False,
@@ -180,9 +192,7 @@ async def check_embeddings_container():
    This is the same as the model container check, but with embeddings instead.
    """

-    if embeddings_container is None or not (
-        embeddings_container.model_is_loading or embeddings_container.model_loaded
-    ):
+    if embeddings_container is None:
        error_message = handle_request_error(
            "No embedding models are currently loaded.",
            exc_info=False,