Merge pull request #254 from lucyknada/main

add draft_gpu_split option for spec decoding
2025-02-11 16:48:03 -05:00 · 2025-02-11 16:48:03 -05:00 · 2e491472d1
commit 2e491472d1
parent e290b88568 beb6d8faa5
3 changed files with 59 additions and 16 deletions
--- a/config_sample.yml
+++ b/config_sample.yml
@ -20,7 +20,7 @@ network:
  # Turn on this option if you are ONLY connecting from localhost.
  disable_auth: false

-  # Disable fetching external content in response to requests, such as images from URLs.
+  # Disable fetching external content in response to requests, such as images from URLs.
  disable_fetch_requests: false

  # Send tracebacks over the API (default: False).
@ -166,6 +166,10 @@ draft_model:
  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
  draft_cache_mode: FP16

+  # An integer array of GBs of VRAM to split between GPUs (default: []).
+  # If this isn't filled in, the draft model is autosplit.
+  draft_gpu_split: []
+
 # Options for Loras
 lora:
  # Directory to look for LoRAs (default: loras).