diff --git a/config_sample.yml b/config_sample.yml
index dc36241..fe723a5 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -45,9 +45,6 @@ model:
   # Disable Flash-attention 2. Set to True for GPUs lower than Nvidia's 3000 series. (default: False)
   no_flash_attention: False
 
-  # Enable low vram optimizations in exllamav2 (default: False)
-  low_mem: False
-
   # Enable 8 bit cache mode for VRAM savings (slight performance hit). Possible values FP16, FP8. (default: FP16)
   cache_mode: FP16
diff --git a/model.py b/model.py
index 25bfb25..6d3d8c7 100644
--- a/model.py
+++ b/model.py
@@ -81,8 +81,12 @@ class ModelContainer:
         self.config.scale_alpha_value = kwargs.get("rope_alpha") or self.calculate_rope_alpha(base_seq_len)
         if "no_flash_attn" in kwargs:
             self.config.no_flash_attn = kwargs["no_flash_attn"]
+
+        # low_mem is currently broken in exllamav2. Don't use it until it's fixed.
+        """
         if "low_mem" in kwargs and kwargs["low_mem"]:
             self.config.set_low_mem()
+        """
 
         chunk_size = min(kwargs.get("chunk_size", 2048), self.config.max_seq_len)
         self.config.max_input_len = chunk_size