Merge branch 'exl3' into backend-detect

This commit is contained in:
Brian 2025-05-08 23:50:33 -04:00 committed by GitHub
commit 02a8d68e17
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 68 additions and 8 deletions

View file

@@ -114,7 +114,8 @@ model:
rope_alpha:
# Enable different cache modes for VRAM savings (default: FP16).
# Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
# Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
# For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8).
cache_mode: FP16
# Size of the prompt cache to allocate (default: max_seq_len).
@@ -164,7 +165,8 @@ draft_model:
draft_rope_alpha:
# Cache mode for draft models to save VRAM (default: FP16).
# Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
# Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
# For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8).
draft_cache_mode: FP16
# An integer array of GBs of VRAM to split between GPUs (default: []).