Model: Reject model requests when unloading

If a model is being unloaded, it is being shut down, and no new
requests should be accepted from that point on.

Also, remove the model_is_loading and model_loaded flags, since we
now simply check whether the container is None.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
kingbri 2025-04-19 22:34:06 -04:00
parent 552a64c723
commit f15ac1f69d
2 changed files with 13 additions and 15 deletions


@@ -7,7 +7,6 @@ import math
 import pathlib
 import traceback
 import torch
-import uuid
 from exllamav2 import (
     ExLlamaV2,
     ExLlamaV2Config,
@@ -99,14 +98,12 @@ class ExllamaV2Container:
     use_vision: bool = False
     vision_model: Optional[ExLlamaV2VisionTower] = None
 
-    # Load state
-    model_is_loading: bool = False
-    model_loaded: bool = False
-
     # Load synchronization
+    # The bool is a master switch for accepting requests
     # The lock keeps load tasks sequential
     # The condition notifies any waiting tasks
     active_job_ids: Dict[str, Optional[ExLlamaV2DynamicJobAsync]] = {}
+    loaded: bool = False
     load_lock: asyncio.Lock = asyncio.Lock()
     load_condition: asyncio.Condition = asyncio.Condition()
 
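For context, these three fields implement a standard asyncio gating pattern: the bool is the master switch, the lock serializes load/unload work, and the condition wakes tasks waiting on that work. Below is a minimal, self-contained sketch of the pattern; the LoadGate class and its sleep-based load body are illustrative stand-ins, not code from this repository:

import asyncio

class LoadGate:
    """Illustrative stand-in for the container's load gating."""

    def __init__(self):
        self.loaded = False  # master switch for accepting requests
        self.load_lock = asyncio.Lock()  # keeps load tasks sequential
        self.load_condition = asyncio.Condition()  # wakes waiting tasks

    async def load(self):
        try:
            await self.load_lock.acquire()
            await asyncio.sleep(0.1)  # stand-in for the real model load
            self.loaded = True
        finally:
            self.load_lock.release()
            async with self.load_condition:
                self.load_condition.notify_all()

    async def unload(self):
        try:
            await self.load_lock.acquire()
            self.loaded = False  # reject new requests from this point on
        finally:
            self.load_lock.release()
            async with self.load_condition:
                self.load_condition.notify_all()

    async def request(self):
        # Wait for any in-flight load/unload to release the lock
        async with self.load_condition:
            await self.load_condition.wait_for(
                lambda: not self.load_lock.locked()
            )
        if not self.loaded:
            raise RuntimeError("Model is being unloaded.")
        return "ok"

A request that arrives while unload() holds the lock waits on the condition and then fails the loaded check, instead of racing the teardown.
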
@@ -560,7 +557,6 @@ class ExllamaV2Container:
         # Do this operation under the load lock's context
         try:
             await self.load_lock.acquire()
-            self.model_is_loading = True
 
             # Wait for existing generation jobs to finish
             await self.wait_for_jobs(kwargs.get("skip_wait"))
@@ -579,11 +575,10 @@ class ExllamaV2Container:
             torch.cuda.empty_cache()
 
             # Cleanup and update model load state
-            self.model_loaded = True
+            self.loaded = True
             logger.info("Model successfully loaded.")
         finally:
             self.load_lock.release()
-            self.model_is_loading = False
 
             async with self.load_condition:
                 self.load_condition.notify_all()
@@ -773,7 +768,7 @@ class ExllamaV2Container:
 
         try:
             # Don't acquire locks unless a model is loaded
-            if self.model_loaded:
+            if self.loaded:
                 await self.load_lock.acquire()
 
                 # Immediately cancel all jobs
@@ -796,7 +791,7 @@ class ExllamaV2Container:
         finally:
             # This means the generator is being recreated
             # The load lock is already released in the load function
-            if self.model_loaded:
+            if self.loaded:
                 self.load_lock.release()
 
             async with self.load_condition:
@@ -905,8 +900,7 @@ class ExllamaV2Container:
         self.generator = None
 
         # Set all model state variables to False
-        self.model_is_loading = False
-        self.model_loaded = False
+        self.loaded = False
 
         gc.collect()
         torch.cuda.empty_cache()
@@ -1233,9 +1227,16 @@ class ExllamaV2Container:
         """
 
         # Wait for load lock to be freed before processing
         # Mainly used for loras and other operations where the class is available
         async with self.load_condition:
             await self.load_condition.wait_for(lambda: not self.load_lock.locked())
 
+        # If the model is being unloaded, don't accept new requests
+        if not self.loaded:
+            raise RuntimeError(
+                "Model is being unloaded. Cannot process new generation requests."
+            )
+
         # Mark that the job is running
         self.active_job_ids[request_id] = None
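With this check in place, anything that submits generation work has to handle the rejection. A hedged sketch of a call site follows; container.generate(...) and logger are assumed names for illustration, not the repository's actual entry point:

# Hypothetical call site for the gated generation path
try:
    result = await container.generate(prompt, request_id)  # name assumed
except RuntimeError as exc:
    # The unload rejection surfaces here; map it to a client-facing
    # error (e.g. HTTP 503) instead of letting the task crash
    logger.warning(f"Generation request {request_id} rejected: {exc}")
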


@@ -14,7 +14,6 @@ if dependencies.extras:
 class InfinityContainer:
     model_dir: pathlib.Path
 
-    model_is_loading: bool = False
     model_loaded: bool = False
 
     # Use a runtime type hint here
@@ -24,8 +23,6 @@ class InfinityContainer:
         self.model_dir = model_directory
 
     async def load(self, **kwargs):
-        self.model_is_loading = True
-
         # Use cpu by default
         device = unwrap(kwargs.get("embeddings_device"), "cpu")
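Per the commit message, "is a model loaded" checks outside the containers now test the container reference itself rather than a flag. A minimal sketch of that replacement, with the module-level container variable assumed for illustration:

# Hypothetical management-layer check replacing the removed flag
if container is None:
    raise RuntimeError("No model is currently loaded.")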