Model: Initial Exl3 cache quantization support

Author: DocShotgun
Committed: 2025-05-01 22:55:51 -07:00
Parent: 036af02bf6
Commit: 68a660bdb3
4 changed files with 42 additions and 6 deletions


@@ -114,7 +114,8 @@ model:
   rope_alpha:
   # Enable different cache modes for VRAM savings (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (e.g. 8,8).
   cache_mode: FP16
   # Size of the prompt cache to allocate (default: max_seq_len).
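For context, a minimal sketch of how the new option might look in a user's config after this change. The keys come from the hunk above; the specific values (Q4 and 8,4) are illustrative choices, not defaults:

```yaml
model:
  # exllamav2 backend: pick one of the fixed presets.
  cache_mode: Q4
  # exllamav3 backend: any k_bits,v_bits pair with each value in 2-8,
  # e.g. 8-bit keys and 4-bit values (uncomment to use instead):
  # cache_mode: 8,4
```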
@@ -164,7 +165,8 @@ draft_model:
   draft_rope_alpha:
   # Cache mode for draft models to save VRAM (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (e.g. 8,8).
   draft_cache_mode: FP16
   # An integer array of GBs of VRAM to split between GPUs (default: []).