Merge branch 'exl3' into backend-detect

This commit is contained in:
Brian 2025-05-08 23:50:33 -04:00 committed by GitHub
commit 02a8d68e17
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 68 additions and 8 deletions

View file

@@ -114,7 +114,8 @@ model:
rope_alpha:
# Enable different cache modes for VRAM savings (default: FP16).
# Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
# Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
# For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8).
cache_mode: FP16
# Size of the prompt cache to allocate (default: max_seq_len).
@@ -164,7 +165,8 @@ draft_model:
draft_rope_alpha:
# Cache mode for draft models to save VRAM (default: FP16).
# Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
# Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
# For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8).
draft_cache_mode: FP16
# An integer array of GBs of VRAM to split between GPUs (default: []).