Clear tokenizer_data cache when unloading model

This commit is contained in:
turboderp 2024-07-08 03:31:05 +02:00
parent b7e7df1220
commit 4cf79c5ae1
2 changed files with 13 additions and 1 deletions

View file

@@ -56,6 +56,11 @@ class ExLlamaV2EbnfFilter(ExLlamaV2Filter):
def _get_lmfe_tokenizer_data(tokenizer: ExLlamaV2Tokenizer):
    """Build lm-format-enforcer tokenizer data for *tokenizer*.

    NOTE(review): `clear_grammar_func_cache()` calls a cache-clearing method
    on this function, so it is presumably wrapped in a caching decorator
    above this hunk — confirm against the full file.
    """

    tokenizer_data = build_token_enforcer_tokenizer_data(tokenizer)
    return tokenizer_data
def clear_grammar_func_cache():
    """Flush the tokenizer_data cache.

    Drops the cached lm-format-enforcer tokenizer data so that no stale
    reference to a tokenizer survives after its model is unloaded
    (otherwise the cache keeps the old tokenizer alive in memory).
    """

    # functools cache decorators expose `cache_clear()`; the previous
    # `clear_cache()` is not an lru_cache attribute and would raise
    # AttributeError at unload time.
    _get_lmfe_tokenizer_data.cache_clear()
class ExLlamaV2Grammar:
"""ExLlamaV2 class for various grammar filters/parsers."""

View file

@@ -26,7 +26,10 @@ from itertools import zip_longest
from loguru import logger
from typing import List, Optional, Union
from backends.exllamav2.grammar import ExLlamaV2Grammar
from backends.exllamav2.grammar import (
ExLlamaV2Grammar,
clear_grammar_func_cache,
)
from backends.exllamav2.utils import (
exllama_disabled_flash_attn,
hardware_supports_flash_attn,
@@ -704,6 +707,10 @@ class ExllamaV2Container:
# Wait for other jobs to finish
await self.wait_for_jobs(kwargs.get("skip_wait"))
# Delete references held in the grammar module
clear_grammar_func_cache()
# Unload LoRAs
if self.generator and self.generator.generator.current_loras:
for lora in self.generator.generator.current_loras:
lora.unload()