Update dependencies, support Python 3.12, update for exl2 0.1.5 (#134)

* Dependencies: Add wheels for Python 3.12
* Model: Switch fp8 cache to Q8 cache
* Model: Add ability to set draft model cache mode
* Dependencies: Bump exllamav2 to 0.1.5
* Model: Support Q6 cache
* Config: Add Q6 cache and draft_cache_mode to config sample
2024-06-09 08:27:39 -07:00 · 2024-06-09 08:27:39 -07:00 · 55d979b7a5
commit 55d979b7a5
parent dcd9428325
5 changed files with 84 additions and 33 deletions
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -101,7 +101,7 @@ model:
  #rope_alpha: 1.0

  # Enable different cache modes for VRAM savings (slight performance hit).
-  # Possible values FP16, FP8, Q4. (default: FP16)
+  # Possible values FP16, Q8, Q6, Q4. (default: FP16)
  #cache_mode: FP16

  # Size of the prompt cache to allocate (default: max_seq_len)
@@ -152,6 +152,10 @@ model:
    # Same thing as alpha_value
    # Leave blank to automatically calculate alpha value
    #draft_rope_alpha: 1.0
+
+    # Enable different draft model cache modes for VRAM savings (slight performance hit).
+    # Possible values FP16, Q8, Q6, Q4. (default: FP16)
+    #draft_cache_mode: FP16
  
  # Options for loras
  #lora: