Model: Initial Exl3 cache quantization support

Author: DocShotgun
Committed: 2025-05-01 22:55:51 -07:00
Parent: 036af02bf6
Commit: 68a660bdb3
4 changed files with 42 additions and 6 deletions


@@ -114,7 +114,8 @@ model:
   rope_alpha:
   # Enable different cache modes for VRAM savings (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (e.g. 8,8).
   cache_mode: FP16
   # Size of the prompt cache to allocate (default: max_seq_len).
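For context, a minimal sketch of how the new option might look in a user's config after this change. The keys come from the hunk above; the specific values (Q4 and 8,4) are illustrative choices, not defaults:

```yaml
model:
  # exllamav2 backend: pick one of the fixed presets.
  cache_mode: Q4
  # exllamav3 backend: any k_bits,v_bits pair with each value in 2-8,
  # e.g. 8-bit keys and 4-bit values (uncomment to use instead):
  # cache_mode: 8,4
```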
@@ -164,7 +165,8 @@ draft_model:
   draft_rope_alpha:
   # Cache mode for draft models to save VRAM (default: FP16).
-  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (e.g. 8,8).
   draft_cache_mode: FP16
   # An integer array of GBs of VRAM to split between GPUs (default: []).