Merge branch 'exl3' into backend-detect
This commit is contained in:
commit
02a8d68e17
4 changed files with 68 additions and 8 deletions
|
|
@ -114,7 +114,8 @@ model:
|
|||
rope_alpha:
|
||||
|
||||
# Enable different cache modes for VRAM savings (default: FP16).
|
||||
# Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
|
||||
# Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
|
||||
# For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2 to 8 (e.g. 8,8).
|
||||
cache_mode: FP16
|
||||
|
||||
# Size of the prompt cache to allocate (default: max_seq_len).
|
||||
|
|
@ -164,7 +165,8 @@ draft_model:
|
|||
draft_rope_alpha:
|
||||
|
||||
# Cache mode for draft models to save VRAM (default: FP16).
|
||||
# Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
|
||||
# Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
|
||||
# For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2 to 8 (e.g. 8,8).
|
||||
draft_cache_mode: FP16
|
||||
|
||||
# An integer array of GBs of VRAM to split between GPUs (default: []).
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue