Model: Add support for Q4 cache
Add Q4 support in addition to the existing 8-bit and 16-bit caches. Passing "Q4" as the cache_mode request parameter will set this mode on model load. Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
parent
0b25c208d6
commit
9a007c4707
2 changed files with 25 additions and 4 deletions
2
main.py
2
main.py
|
|
@ -149,7 +149,7 @@ async def get_current_model():
|
|||
rope_scale=MODEL_CONTAINER.config.scale_pos_emb,
|
||||
rope_alpha=MODEL_CONTAINER.config.scale_alpha_value,
|
||||
max_seq_len=MODEL_CONTAINER.config.max_seq_len,
|
||||
cache_mode="FP8" if MODEL_CONTAINER.cache_fp8 else "FP16",
|
||||
cache_mode=MODEL_CONTAINER.cache_mode,
|
||||
prompt_template=prompt_template.name if prompt_template else None,
|
||||
num_experts_per_token=MODEL_CONTAINER.config.num_experts_per_token,
|
||||
use_cfg=MODEL_CONTAINER.use_cfg,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue