Model: Create generator on load

This commit is contained in:
turboderp 2025-05-03 18:32:51 +02:00
parent 0d949d00b9
commit 58c380b8ca

View file

@@ -345,6 +345,9 @@ class ExllamaV3Container(BaseModelContainer):
async for value in iterate_in_threadpool(generator):
yield value
# Create async generator
await self.create_generator()
# Clean up any extra vram usage from torch and cuda
# (Helps reduce VRAM bottlenecking on Windows)
gc.collect()
@@ -774,12 +777,6 @@ class ExllamaV3Container(BaseModelContainer):
f"max_seq_len {self.max_seq_len}"
)
self.generator = AsyncGenerator(
model=self.model,
cache=self.cache,
tokenizer=self.tokenizer,
)
generation = {}
job = AsyncJob(
self.generator,