Model: Reject model requests when unloading
If a model is being unloaded, that means it's being shut down and no requests should be accepted from then on. Also, remove the model_is_loading/model_loaded flags, since we now simply check whether the container is None.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
parent 552a64c723
commit f15ac1f69d

2 changed files with 13 additions and 15 deletions
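Before the diff, a quick illustration of the scheme the new comments describe: loaded acts as a master switch for accepting requests, load_lock keeps load/unload tasks sequential, and load_condition wakes tasks waiting on the lock. The sketch below is a minimal, self-contained approximation of that pattern; ModelContainerSketch, its method bodies, and the sleep calls are illustrative stand-ins, not the actual container code.

import asyncio


class ModelContainerSketch:
    """Minimal stand-in illustrating the master-switch pattern, not the real container."""

    def __init__(self) -> None:
        self.loaded = False                        # master switch for accepting requests
        self.load_lock = asyncio.Lock()            # keeps load/unload tasks sequential
        self.load_condition = asyncio.Condition()  # wakes tasks waiting on the lock

    async def load(self) -> None:
        try:
            await self.load_lock.acquire()
            await asyncio.sleep(0.05)  # stand-in for loading weights
            self.loaded = True
        finally:
            self.load_lock.release()
            async with self.load_condition:
                self.load_condition.notify_all()

    async def unload(self) -> None:
        async with self.load_lock:
            self.loaded = False        # reject new requests from this point on
            await asyncio.sleep(0.05)  # stand-in for freeing the model
        async with self.load_condition:
            self.load_condition.notify_all()

    async def generate(self, request_id: str) -> str:
        # Wait for any in-flight load/unload to finish before processing
        async with self.load_condition:
            await self.load_condition.wait_for(lambda: not self.load_lock.locked())

        # If the model is being unloaded (or was never loaded), refuse the request
        if not self.loaded:
            raise RuntimeError(
                "Model is being unloaded. Cannot process new generation requests."
            )

        return f"generated text for {request_id}"

With this arrangement, a generate() call issued while unload() holds the lock waits on the condition and then fails fast with the RuntimeError once the master switch is off.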
@@ -7,7 +7,6 @@ import math
 import pathlib
 import traceback
 import torch
-import uuid
 from exllamav2 import (
     ExLlamaV2,
     ExLlamaV2Config,
@@ -99,14 +98,12 @@ class ExllamaV2Container:
     use_vision: bool = False
     vision_model: Optional[ExLlamaV2VisionTower] = None
 
-    # Load state
-    model_is_loading: bool = False
-    model_loaded: bool = False
-
     # Load synchronization
+    # The bool is a master switch for accepting requests
     # The lock keeps load tasks sequential
     # The condition notifies any waiting tasks
     active_job_ids: Dict[str, Optional[ExLlamaV2DynamicJobAsync]] = {}
+    loaded: bool = False
     load_lock: asyncio.Lock = asyncio.Lock()
     load_condition: asyncio.Condition = asyncio.Condition()
 
@@ -560,7 +557,6 @@ class ExllamaV2Container:
         # Do this operation under the load lock's context
         try:
             await self.load_lock.acquire()
-            self.model_is_loading = True
 
             # Wait for existing generation jobs to finish
             await self.wait_for_jobs(kwargs.get("skip_wait"))
@@ -579,11 +575,10 @@ class ExllamaV2Container:
             torch.cuda.empty_cache()
 
             # Cleanup and update model load state
-            self.model_loaded = True
+            self.loaded = True
             logger.info("Model successfully loaded.")
         finally:
             self.load_lock.release()
-            self.model_is_loading = False
 
             async with self.load_condition:
                 self.load_condition.notify_all()
@@ -773,7 +768,7 @@ class ExllamaV2Container:
 
         try:
             # Don't acquire locks unless a model is loaded
-            if self.model_loaded:
+            if self.loaded:
                 await self.load_lock.acquire()
 
                 # Immediately cancel all jobs
@@ -796,7 +791,7 @@ class ExllamaV2Container:
         finally:
             # This means the generator is being recreated
             # The load lock is already released in the load function
-            if self.model_loaded:
+            if self.loaded:
                 self.load_lock.release()
 
             async with self.load_condition:
@@ -905,8 +900,7 @@ class ExllamaV2Container:
         self.generator = None
 
         # Set all model state variables to False
-        self.model_is_loading = False
-        self.model_loaded = False
+        self.loaded = False
 
         gc.collect()
         torch.cuda.empty_cache()
@@ -1233,9 +1227,16 @@ class ExllamaV2Container:
         """
 
         # Wait for load lock to be freed before processing
         # Mainly used for loras and other operations where the class is available
         async with self.load_condition:
             await self.load_condition.wait_for(lambda: not self.load_lock.locked())
+
+        # If the model is being unloaded, don't accept new requests
+        if not self.loaded:
+            raise RuntimeError(
+                "Model is being unloaded. Cannot process new generation requests."
+            )
+
         # Mark that the job is running
         self.active_job_ids[request_id] = None
 
@@ -14,7 +14,6 @@ if dependencies.extras:
 
 class InfinityContainer:
     model_dir: pathlib.Path
-    model_is_loading: bool = False
     model_loaded: bool = False
 
     # Use a runtime type hint here
@@ -24,8 +23,6 @@ class InfinityContainer:
         self.model_dir = model_directory
 
     async def load(self, **kwargs):
-        self.model_is_loading = True
-
         # Use cpu by default
         device = unwrap(kwargs.get("embeddings_device"), "cpu")
 
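The commit message's second point, dropping the loading flags in favor of a None check on the container, can be illustrated with a short hypothetical sketch. The names InfinityContainerSketch, container, and embeddings_ready below are illustrative assumptions, not identifiers from this repository.

from typing import Optional


class InfinityContainerSketch:
    """Stand-in for the embeddings container; details omitted."""


# Module-level handle, as in a hypothetical model-management module
container: Optional[InfinityContainerSketch] = None


def embeddings_ready() -> bool:
    # "Loaded" is inferred from the handle itself: there is no separate
    # model_is_loading / model_loaded boolean to keep in sync.
    return container is not None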