Model: Add proper jobs cleanup and fix var calls

Jobs should be started and immediately cleaned up when calling the
generation stream. Expose a stream_generate function and append
this to the base class since it's more idiomatic than generate_gen.

The exl2 container's generate_gen function is now internal.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
This commit is contained in:
kingbri 2025-04-24 21:30:55 -04:00
parent 7e007f0761
commit f070587e9f
6 changed files with 45 additions and 26 deletions

View file

@@ -52,7 +52,7 @@ async def _stream_collector(data: GenerateRequest, request: Request):
try:
logger.info(f"Received Kobold generation request {data.genkey}")
- generator = model.container.generate_gen(
+ generator = model.container.stream_generate(
request_id=data.genkey, abort_event=abort_event, **data.model_dump()
)
async for generation in generator:

View file

@@ -95,7 +95,7 @@ async def _stream_collector(
"""Collects a stream and places results in a common queue"""
try:
- new_generation = model.container.generate_gen(
+ new_generation = model.container.stream_generate(
request_id,
prompt,
params,
@@ -120,7 +120,7 @@ async def load_inline_model(model_name: str, request: Request):
if (
model.container
and model.container.model_dir.name == model_name
- and model.container.model_loaded
+ and model.container.loaded
):
return