diff --git a/OAI/types/model.py b/OAI/types/model.py
index 1a2db7f..cc08742 100644
--- a/OAI/types/model.py
+++ b/OAI/types/model.py
@@ -15,7 +15,8 @@ class ModelList(BaseModel):
 class ModelLoadRequest(BaseModel):
     name: str
     max_seq_len: Optional[int] = 4096
-    gpu_split: Optional[str] = "auto"
+    gpu_split_auto: Optional[bool] = True
+    gpu_split: Optional[List[float]] = Field(default_factory=list)
     rope_scale: Optional[float] = 1.0
     rope_alpha: Optional[float] = 1.0
     no_flash_attention: Optional[bool] = False
diff --git a/main.py b/main.py
index b5a971e..607922a 100644
--- a/main.py
+++ b/main.py
@@ -139,8 +139,6 @@ if __name__ == "__main__":
             loading_bar.finish()
         else:
             loading_bar.next()
-
-    print("Model successfully loaded.")
 
     network_config = config["network"]
     uvicorn.run(app, host=network_config["host"] or "127.0.0.1", port=network_config["port"] or 8012, log_level="debug")
diff --git a/model.py b/model.py
index 1ea9da4..724628f 100644
--- a/model.py
+++ b/model.py
@@ -55,7 +55,7 @@ class ModelContainer:
             By default, the draft model's alpha value is calculated automatically to scale to
             the size of the full model.
         'gpu_split_auto' (bool): Automatically split model across available devices (default: True)
-        'gpu_split' (list): Allocation for weights and (some) tensors, per device
+        'gpu_split' (list[float]): Allocation for weights and (some) tensors, per device
         'no_flash_attn' (bool): Turns off flash attention (increases vram usage)
         """
 
@@ -63,7 +63,7 @@ class ModelContainer:
         self.cache_fp8 = "cache_mode" in kwargs and kwargs["cache_mode"] == "FP8"
 
         self.gpu_split = kwargs.get("gpu_split", None)
-        self.gpu_split_auto = self.gpu_split == "auto"
+        self.gpu_split_auto = kwargs.get("gpu_split_auto", True)
 
         self.config = ExLlamaV2Config()
         self.config.model_dir = str(model_directory.resolve())
@@ -177,6 +177,8 @@ class ModelContainer:
 
         self.generator = ExLlamaV2StreamingGenerator(self.model, self.cache, self.tokenizer, self.draft_model, self.draft_cache)
 
+        print("Model successfully loaded.")
+
     def unload(self):
         """
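
For context, a minimal client-side sketch of how a load request payload changes under the new `ModelLoadRequest` schema. This is illustrative only: the `/v1/model/load` endpoint path and the gigabyte interpretation of the `gpu_split` values are assumptions, not taken from this diff; the host and port come from the `uvicorn.run` defaults shown above.

```python
# Illustrative sketch only -- the endpoint path and GB units are assumptions.
import requests

# Old schema: gpu_split was a single string, with "auto" as a magic value.
old_payload = {"name": "my-model", "gpu_split": "auto"}

# New schema: auto-splitting is an explicit boolean, and gpu_split is a list
# of per-device allocations (assumed GB), intended for the manual case where
# gpu_split_auto is False.
new_payload = {
    "name": "my-model",
    "gpu_split_auto": False,
    "gpu_split": [18.0, 24.0],  # hypothetical two-GPU split
}

requests.post("http://127.0.0.1:8012/v1/model/load", json=new_payload)
```

Splitting the old string field into a boolean plus a typed `List[float]` lets pydantic validate the per-device values directly instead of relying on the `"auto"` sentinel string.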