From 58c380b8ca649aa1670fa9730b7935fc0143d2bd Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sat, 3 May 2025 18:32:51 +0200 Subject: [PATCH] Model: Create generator on load --- backends/exllamav3/model.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index e430c04..09e5585 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -345,6 +345,9 @@ class ExllamaV3Container(BaseModelContainer): async for value in iterate_in_threadpool(generator): yield value + # Create async generator + await self.create_generator() + # Clean up any extra vram usage from torch and cuda # (Helps reduce VRAM bottlenecking on Windows) gc.collect() @@ -774,12 +777,6 @@ class ExllamaV3Container(BaseModelContainer): f"max_seq_len {self.max_seq_len}" ) - self.generator = AsyncGenerator( - model=self.model, - cache=self.cache, - tokenizer=self.tokenizer, - ) - generation = {} job = AsyncJob( self.generator,