diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 30f8a2c..52bc890 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -400,21 +400,27 @@ class ExllamaV2Container:
         async for value in iterate_in_threadpool(model_load_generator):
             yield value
 
-        # Disable paged mode if the user's min GPU is supported (ampere and above)
+        # Disable paged mode if the user's min GPU isn't supported (Ampere and up)
+        device_list = {
+            module.device_idx
+            for module in self.model.modules
+            if module.device_idx >= 0
+        }
         min_compute_capability = min(
-            set(
-                [
-                    torch.cuda.get_device_capability(device=module.device_idx)[0]
-                    for module in self.model.modules
-                    if module.device_idx >= 0
-                ]
-            )
+            torch.cuda.get_device_capability(device=device)[0]
+            for device in device_list
         )
 
+        # Compute capability < 8 is not supported by FA2
+        # AMD is also unsupported until ROCm updates its FA2 fork
         if torch.version.hip or min_compute_capability < 8:
             logger.warning(
                 "An unsupported GPU is found in this configuration. "
-                "Switching to compatibility mode. This disables parallel batching."
+                "Switching to compatibility mode. \n"
+                "This disables parallel batching "
+                "and features that rely on it (e.g. CFG). \n"
+                "To avoid compatibility mode, all GPUs must be Ampere "
+                "(30 series) or newer. AMD GPUs are not supported."
             )
             self.paged = False
             self.max_batch_size = 1
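
For reviewers, here is a minimal standalone sketch of the gating logic this hunk introduces. It assumes a PyTorch install with CUDA (or ROCm) visible; `supports_paged_attention` is a hypothetical helper, not part of the patch, and it iterates over all visible devices rather than `self.model.modules`, since the container isn't available outside the class.

```python
# Illustrative sketch only: mirrors the diff's FA2 capability check,
# but probes every visible device instead of the model's module list.
import torch


def supports_paged_attention() -> bool:
    """Return True if every visible GPU can run FA2 (compute capability >= 8)."""
    # ROCm builds report a truthy torch.version.hip; FA2's ROCm fork lags
    # behind, so AMD GPUs force compatibility mode regardless of capability.
    if torch.version.hip:
        return False

    device_count = torch.cuda.device_count()
    if device_count == 0:
        return False

    # Major compute capability per device; min() finds the weakest GPU,
    # which is the one that decides whether paged mode is safe.
    min_compute_capability = min(
        torch.cuda.get_device_capability(device=idx)[0]
        for idx in range(device_count)
    )
    return min_compute_capability >= 8


if __name__ == "__main__":
    if supports_paged_attention():
        print("Paged mode available: parallel batching and CFG enabled.")
    else:
        print("Compatibility mode: paged attention off, max_batch_size = 1.")
```

The set-comprehension refactor in the hunk serves the same purpose: deduplicating device indices once, then taking `min()` over a plain generator, rather than building a list inside a `set()` inside `min()`.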