Sometimes fastchat may not be able to detect the prompt template from the model path. Therefore, add the ability to set it in config.yml or via the request object itself. Also send the provided prompt template on model info request. Signed-off-by: kingbri <bdashore3@proton.me>
80 lines
No EOL
2.7 KiB
YAML
80 lines
No EOL
2.7 KiB
YAML
# Sample YAML file for configuration.
|
|
# Comment out values as needed. Every value has a default within the application.
|
|
|
|
# Unless specified in the comments, DO NOT put these options in quotes!
|
|
# You can use https://www.yamllint.com/ if you want to check your YAML formatting.
|
|
|
|
# Options for networking
|
|
network:
  # The IP address to host on (default: 127.0.0.1).
  # Use 0.0.0.0 to expose on all network adapters.
  host: 127.0.0.1

  # The port to host on (default: 5000)
  port: 5000
|
|
|
|
# Options for model overrides and loading
|
|
model:
  # Overrides the directory to look for models (default: models)
  # Windows users, DO NOT put this path in quotes! This directory will be invalid otherwise.
  model_dir: your model directory path

  # An initial model to load. Make sure the model is located in the model directory!
  # A model can be loaded later via the API.
  model_name: A model name

  # Sends dummy model names when the models endpoint is queried
  # Enable this if the program is looking for a specific OAI model
  use_dummy_models: false

  # The below parameters apply only if model_name is set

  # Maximum model context length (default: 4096)
  max_seq_len: 4096

  # Automatically allocate resources to GPUs (default: True)
  gpu_split_auto: true

  # An array of GBs of VRAM to split between GPUs (default: [])
  gpu_split: [20.6, 24]

  # Rope scaling parameters (default: 1.0)
  rope_scale: 1.0
  rope_alpha: 1.0

  # Disable Flash-attention 2. Set to True for GPUs lower than Nvidia's 3000 series. (default: False)
  no_flash_attention: false

  # Enable 8 bit cache mode for VRAM savings (slight performance hit). Possible values FP16, FP8. (default: FP16)
  cache_mode: FP16

  # Set the prompt template for this model. If empty, fastchat will automatically find the best template to use (default: None)
  # NOTE: Only works with chat completion message lists!
  prompt_template:
|
|
|
|
# Options for draft models (speculative decoding). This will use more VRAM!
|
|
draft:
  # NOTE(review): the original indentation of this section was lost; confirm
  # whether "draft" is a top-level key or belongs nested under "model".

  # Overrides the directory to look for draft models (default: models)
  draft_model_dir: Your draft model directory path

  # An initial draft model to load. Make sure this model is located in the model directory!
  # A draft model can be loaded later via the API.
  draft_model_name: A model name

  # Rope parameters for draft models (default: 1.0)
  draft_rope_scale: 1.0
  draft_rope_alpha: 1.0
|
|
|
|
# Options for loras
|
|
lora:
  # NOTE(review): the original indentation of this section was lost; confirm
  # whether "lora" is a top-level key or belongs nested under "model".

  # Overrides the directory to look for loras (default: loras)
  lora_dir: Your lora directory path

  # List of loras to load and associated scaling factors (default: 1.0). Comment out unused entries or add more rows as needed.
  loras:
    - name: lora1
      scaling: 1.0
    - name: lora2
      scaling: 0.9
    - name: lora3
      scaling: 0.5