Use the same algorithm for estimating and adjusting cache size based on multiples of 256 and above max seq len. Same applies for chunk size. Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
16 lines
464 B
Python
16 lines
464 B
Python
from loguru import logger
|
|
|
|
|
|
def exllama_disabled_flash_attn(no_flash_attn: bool):
    """Warn when Flash Attention is reported as disabled.

    Args:
        no_flash_attn: Flag indicating that Flash Attention is off.

    Returns:
        The same ``no_flash_attn`` value that was passed in, unchanged.
    """

    # Nothing to report when the flag is not set; hand it straight back.
    if not no_flash_attn:
        return no_flash_attn

    # Adjacent literals are concatenated into one multi-line warning.
    warning_text = (
        "ExllamaV2 has disabled Flash Attention. \n"
        "Please see the above logs for warnings/errors. \n"
        "Switching to compatibility mode. \n"
        "This disables parallel batching "
        "and features that rely on it (ex. CFG). \n"
    )
    logger.warning(warning_text)

    return no_flash_attn
|