diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 57f2a09..798ef2f 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -11,8 +11,9 @@ from exllamav2 import (
     ExLlamaV2,
     ExLlamaV2Config,
     ExLlamaV2Cache,
-    ExLlamaV2Cache_8bit,
     ExLlamaV2Cache_Q4,
+    ExLlamaV2Cache_Q6,
+    ExLlamaV2Cache_Q8,
     ExLlamaV2Tokenizer,
     ExLlamaV2Lora,
 )
@@ -60,6 +61,7 @@ class ExllamaV2Container:
     # Internal config vars
     cache_size: int = None
     cache_mode: str = "FP16"
+    draft_cache_mode: str = "FP16"
     max_batch_size: int = 20

     generation_config: Optional[GenerationConfig] = None
@@ -91,7 +93,7 @@ class ExllamaV2Container:
         def progress(loaded_modules: int, total_modules: int, loading_draft: bool)

         **kwargs:
-            'cache_mode' (str): Sets cache mode, "FP16" or "FP8"
+            'cache_mode' (str): Sets cache mode: "FP16"/"Q8"/"Q6"/"Q4" (default: "FP16")
             'max_seq_len' (int): Override model's default max sequence length
                 (default: 4096)
@@ -116,6 +118,8 @@ class ExllamaV2Container:
                 model. By default, the draft model's alpha value is calculated
                 automatically to scale to the size of the full model.
+            'draft_cache_mode' (str): Sets draft cache mode: "FP16"/"Q8"/"Q6"/"Q4"
+                (default: "FP16")
             'lora_dir' (str): LoRA directory
             'loras' (list[dict]): List of loras to be loaded, consisting of
                 'name' and 'scaling'
@@ -373,6 +377,7 @@ class ExllamaV2Container:
                 self.calculate_rope_alpha(self.draft_config.max_seq_len),
             )
             self.draft_config.max_seq_len = self.config.max_seq_len
+            self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")

         if chunk_size:
             self.draft_config.max_input_len = chunk_size
@@ -460,6 +465,7 @@ class ExllamaV2Container:
                 "rope_scale": self.draft_config.scale_pos_emb,
                 "rope_alpha": self.draft_config.scale_alpha_value,
                 "max_seq_len": self.draft_config.max_seq_len,
+                "cache_mode": self.draft_cache_mode,
             }

             model_params["draft"] = draft_model_params
@@ -571,11 +577,30 @@ class ExllamaV2Container:
             if not self.quiet:
                 logger.info("Loading draft model: " + self.draft_config.model_dir)

-            self.draft_cache = ExLlamaV2Cache(
-                self.draft_model,
-                max_seq_len=self.cache_size,
-                lazy=True,
-            )
+            if self.draft_cache_mode == "Q4":
+                self.draft_cache = ExLlamaV2Cache_Q4(
+                    self.draft_model,
+                    max_seq_len=self.cache_size,
+                    lazy=True,
+                )
+            elif self.draft_cache_mode == "Q6":
+                self.draft_cache = ExLlamaV2Cache_Q6(
+                    self.draft_model,
+                    max_seq_len=self.cache_size,
+                    lazy=True,
+                )
+            elif self.draft_cache_mode == "Q8":
+                self.draft_cache = ExLlamaV2Cache_Q8(
+                    self.draft_model,
+                    max_seq_len=self.cache_size,
+                    lazy=True,
+                )
+            else:
+                self.draft_cache = ExLlamaV2Cache(
+                    self.draft_model,
+                    max_seq_len=self.cache_size,
+                    lazy=True,
+                )
             for value in self.draft_model.load_autosplit_gen(
                 self.draft_cache,
                 reserve_vram=autosplit_reserve,
@@ -612,8 +637,15 @@ class ExllamaV2Container:
                 lazy=self.gpu_split_auto,
                 batch_size=1,
             )
-        elif self.cache_mode == "FP8":
-            self.cache = ExLlamaV2Cache_8bit(
+        elif self.cache_mode == "Q6":
+            self.cache = ExLlamaV2Cache_Q6(
+                self.model,
+                max_seq_len=self.cache_size,
+                lazy=self.gpu_split_auto,
+                batch_size=1,
+            )
+        elif self.cache_mode == "Q8":
+            self.cache = ExLlamaV2Cache_Q8(
                 self.model,
                 max_seq_len=self.cache_size,
                 lazy=self.gpu_split_auto,
diff --git a/backends/exllamav2/utils.py b/backends/exllamav2/utils.py
index aa78a68..0c0e2cf 100644
--- a/backends/exllamav2/utils.py
+++ b/backends/exllamav2/utils.py
@@ -6,7 +6,7 @@ from loguru import logger
 def check_exllama_version():
     """Verifies the exllama version"""

-    required_version = version.parse("0.1.4")
+    required_version = version.parse("0.1.5")
     current_version = version.parse(package_version("exllamav2").split("+")[0])

     if current_version < required_version:
diff --git a/config_sample.yml b/config_sample.yml
index 6cdec10..1c0304d 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -101,7 +101,7 @@ model:
   #rope_alpha: 1.0

   # Enable different cache modes for VRAM savings (slight performance hit).
-  # Possible values FP16, FP8, Q4. (default: FP16)
+  # Possible values FP16, Q8, Q6, Q4. (default: FP16)
   #cache_mode: FP16

   # Size of the prompt cache to allocate (default: max_seq_len)
@@ -152,6 +152,10 @@ model:
     # Same thing as alpha_value
     # Leave blank to automatically calculate alpha value
     #draft_rope_alpha: 1.0
+
+    # Enable different draft model cache modes for VRAM savings (slight performance hit).
+    # Possible values FP16, Q8, Q6, Q4. (default: FP16)
+    #draft_cache_mode: FP16

 # Options for loras
 #lora:
diff --git a/endpoints/OAI/types/model.py b/endpoints/OAI/types/model.py
index 1a98c17..8847680 100644
--- a/endpoints/OAI/types/model.py
+++ b/endpoints/OAI/types/model.py
@@ -53,6 +53,7 @@ class DraftModelLoadRequest(BaseModel):
         default=None,
         examples=[1.0],
     )
+    draft_cache_mode: Optional[str] = "FP16"


 class ModelLoadRequest(BaseModel):
diff --git a/pyproject.toml b/pyproject.toml
index 5dc6553..0af8660 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,54 +47,68 @@ dev = [
 ]
 cu121 = [
     # Torch (Extra index URLs not support in pyproject.toml)
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.3.0%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.3.0%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.3.0%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.3.0%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

     # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu121.torch2.3.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu121.torch2.3.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu121.torch2.3.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu121.torch2.3.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

     # Windows FA2 from https://github.com/bdashore3/flash-attention/releases
-    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.0cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.0cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",

     # Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
+    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
     "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
     "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 ]
 cu118 = [
     # Torch
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.3.0%2Bcu118-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.3.0%2Bcu118-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.3.0%2Bcu118-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.3.0%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

     # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu118.torch2.3.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu118.torch2.3.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu118.torch2.3.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu118.torch2.3.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

     # Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
+    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu118torch2.3cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
     "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu118torch2.3cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
     "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 ]
 amd = [
     # Torch triton for ROCm
+    "pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-2.2.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
     "pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-2.2.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
     "pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-2.2.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",

     # Torch
-    "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.0%2Brocm6.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.0%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
+    "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.1%2Brocm6.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
+    "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.1%2Brocm6.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
+    "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.1%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",

     # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+rocm6.0.torch2.3.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+rocm6.0.torch2.3.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm6.0.torch2.3.1-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm6.0.torch2.3.1-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm6.0.torch2.3.1-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
 ]

 # MARK: Ruff options
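
Usage note (not part of the patch): the cache selection added in model.py is a plain string-to-class dispatch over the exllamav2 cache types, applied independently to the main model and the draft model. A minimal standalone sketch of the same idea, assuming exllamav2 >= 0.1.5 (where ExLlamaV2Cache_Q6 and ExLlamaV2Cache_Q8 exist); the helper name make_cache and the table-based dispatch are illustrative, not the patch's literal code:

from exllamav2 import (
    ExLlamaV2Cache,
    ExLlamaV2Cache_Q4,
    ExLlamaV2Cache_Q6,
    ExLlamaV2Cache_Q8,
)

# Map the user-facing cache_mode string to an exllamav2 cache class.
# Unknown values fall back to the full-precision FP16 cache, mirroring
# the else branch in the model.py hunk above.
_CACHE_CLASSES = {
    "Q4": ExLlamaV2Cache_Q4,
    "Q6": ExLlamaV2Cache_Q6,
    "Q8": ExLlamaV2Cache_Q8,
}


def make_cache(model, cache_mode: str, cache_size: int, lazy: bool = True):
    """Build a KV cache for an already-loaded ExLlamaV2 model."""
    cache_class = _CACHE_CLASSES.get(cache_mode, ExLlamaV2Cache)
    return cache_class(model, max_seq_len=cache_size, lazy=lazy)

The same strings are what config_sample.yml accepts for cache_mode and draft_cache_mode (FP16, Q8, Q6, Q4), and what the draft_cache_mode field added to DraftModelLoadRequest carries over the API.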