Model: Format

Signed-off-by: kingbri <bdashore3@proton.me>

parent 5ae2a91c04
commit 0f4290f05c

2 changed files with 3 additions and 30 deletions

main.py (4 changes)
@@ -120,7 +120,7 @@ async def load_model(request: Request, data: ModelLoadRequest):
        break

    if module == 0:
        loading_bar: IncrementalBar = IncrementalBar("Modules", max = modules)
        loading_bar: IncrementalBar = IncrementalBar("Modules", max = modules)
    elif module == modules:
        loading_bar.next()
        loading_bar.finish()
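Note: the hunk above drives a console progress bar from the (module, modules) progress values reported during model loading. A minimal standalone sketch of that pattern using the same progress.bar.IncrementalBar API; the report_progress helper and the driver loop are assumptions for illustration, not part of this diff:

# Illustrative sketch only: mirrors the module-count progress pattern above.
from progress.bar import IncrementalBar

def report_progress(module: int, modules: int, loading_bar = None):
    if module == 0:
        # First report: size the bar to the total number of modules.
        loading_bar = IncrementalBar("Modules", max = modules)
    elif module == modules:
        # Last module: advance once more and finalize the bar.
        loading_bar.next()
        loading_bar.finish()
    else:
        loading_bar.next()
    return loading_bar

# Example driver simulating a 4-module load.
bar = None
for module in range(5):
    bar = report_progress(module, 4, bar)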
@@ -215,8 +215,6 @@ async def load_model(data: LoraLoadRequest):
# Unload lora endpoint
@app.get("/v1/lora/unload", dependencies=[Depends(check_admin_key), Depends(_check_model_container)])
async def unload_loras():
    global model_container

    model_container.unload(True)

# Encode tokens endpoint
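Note: as a usage illustration of the endpoint above, a client call might look like the sketch below. The host, port, and the x-admin-key header name are assumptions; check check_admin_key and the server config for the actual values.

# Hypothetical client call for GET /v1/lora/unload (values are assumptions).
import requests

resp = requests.get(
    "http://localhost:5000/v1/lora/unload",
    headers = {"x-admin-key": "<your admin key>"},
)
print(resp.status_code, resp.text)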
model.py (29 changes)
@@ -19,7 +19,6 @@ from utils import coalesce, unwrap
auto_split_reserve_bytes = 96 * 1024**2

class ModelContainer:

    config: Optional[ExLlamaV2Config] = None
    draft_config: Optional[ExLlamaV2Config] = None
    model: Optional[ExLlamaV2] = None
@@ -32,7 +31,7 @@ class ModelContainer:
    cache_fp8: bool = False
    gpu_split_auto: bool = True
    gpu_split: Optional[list] = None


    active_loras: List[ExLlamaV2Lora] = []

    def __init__(self, model_directory: pathlib.Path, quiet = False, **kwargs):
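Note: gpu_split_auto and gpu_split above select between automatic and manual VRAM splitting. A simplified sketch of how those two modes typically map onto ExLlamaV2 loading calls; this is illustrative only, the real flow lives in ModelContainer.load further down, and the helper name is an assumption:

# Simplified sketch of how the two split modes are typically consumed.
def load_weights(model, cache, gpu_split_auto: bool, gpu_split, reserve):
    if gpu_split_auto:
        # Automatic: let ExLlamaV2 spread layers across GPUs, keeping some
        # VRAM free (cf. auto_split_reserve_bytes and reserve_vram below).
        model.load_autosplit(cache, reserve_vram = reserve)
    else:
        # Manual: gpu_split is a per-GPU allocation list, e.g. [16, 24].
        model.load(gpu_split)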
@@ -108,7 +107,6 @@ class ModelContainer:
            enable_draft = False

        if enable_draft:

            self.draft_config = ExLlamaV2Config()
            draft_model_path = pathlib.Path(unwrap(draft_args.get("draft_model_dir"), "models"))
            draft_model_path = draft_model_path / draft_model_name
@@ -124,7 +122,6 @@ class ModelContainer:
            self.draft_config.max_input_len = kwargs["chunk_size"]
            self.draft_config.max_attn_size = kwargs["chunk_size"] ** 2


    def calculate_rope_alpha(self, base_seq_len):
        ratio = self.config.max_seq_len / base_seq_len
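Note: only the ratio line of calculate_rope_alpha appears in this hunk. For context, several ExLlamaV2 front ends derive the NTK rope alpha from that ratio with a quadratic fit along the lines of the sketch below; the exact polynomial is an assumption, not confirmed by this diff:

# Illustrative NTK-aware alpha estimate from the sequence-length ratio above.
def estimate_rope_alpha(max_seq_len: int, base_seq_len: int) -> float:
    ratio = max_seq_len / base_seq_len
    if ratio <= 1.0:
        # No context extension requested; keep the default alpha.
        return 1.0
    # Common quadratic fit for extended-context alpha (assumed, see note above).
    return -0.13436 + 0.80541 * ratio + 0.28833 * ratio ** 2

print(estimate_rope_alpha(8192, 4096))  # ~2.63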
@@ -136,7 +133,6 @@ class ModelContainer:
        model_path = pathlib.Path(self.config.model_dir)
        return model_path


    def load(self, progress_callback = None):
        """
        Load model
@@ -184,13 +180,10 @@
        """

        # Load tokenizer

        self.tokenizer = ExLlamaV2Tokenizer(self.config)

        # Load draft model if a config is present

        if self.draft_config:

            self.draft_model = ExLlamaV2(self.draft_config)
            if not self.quiet:
                print("Loading draft model: " + self.draft_config.model_dir)
@@ -200,12 +193,10 @@ class ModelContainer:
            yield from self.draft_model.load_autosplit_gen(self.draft_cache, reserve_vram = reserve, last_id_only = True, callback_gen = progress_callback)

            # Test VRAM allocation with a full-length forward pass

            input_ids = torch.zeros((1, self.config.max_input_len), dtype = torch.long)
            self.draft_model.forward(input_ids, cache = self.cache, preprocess_only = True)

        # Load model

        self.model = ExLlamaV2(self.config)
        if not self.quiet:
            print("Loading model: " + self.config.model_dir)
@@ -225,12 +216,10 @@ class ModelContainer:
            yield from self.model.load_autosplit_gen(self.cache, reserve_vram = reserve, last_id_only = True, callback_gen = progress_callback)

        # Test VRAM allocation with a full-length forward pass

        input_ids = torch.zeros((1, self.config.max_input_len), dtype = torch.long)
        self.model.forward(input_ids, cache = self.cache, preprocess_only = True)

        # Create generator

        self.generator = ExLlamaV2StreamingGenerator(self.model, self.cache, self.tokenizer, self.draft_model, self.draft_cache)

        print("Model successfully loaded.")
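Note: both load hunks above finish with a full-length, preprocess-only forward pass so that a VRAM shortfall shows up at load time instead of mid-generation. A condensed, standalone sketch of that idea; the model path and the surrounding setup are simplified assumptions, with class names taken from exllamav2:

# Condensed sketch of the "test VRAM with a full-length forward pass" step.
import torch
from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config

config = ExLlamaV2Config()
config.model_dir = "models/my-model"          # hypothetical path
config.prepare()

model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, lazy = True)    # allocated while layers load
model.load_autosplit(cache)

# A dummy prompt of max_input_len tokens: if this fits, real generation
# at full context length should fit as well.
input_ids = torch.zeros((1, config.max_input_len), dtype = torch.long)
model.forward(input_ids, cache = cache, preprocess_only = True)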
@@ -274,7 +263,6 @@ class ModelContainer:
        ids = torch.tensor([ids])
        return self.tokenizer.decode(ids, decode_special_tokens = unwrap(kwargs.get("decode_special_tokens"), True))[0]


    def generate(self, prompt: str, **kwargs):
        gen = list(self.generate_gen(prompt, **kwargs))
        reponse = "".join(map(lambda o: o[0], gen))
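Note: generate() above simply drains the streaming generator and concatenates the first element of each yielded tuple. The same pattern with a toy generator standing in for generate_gen (the fake generator and its tuple contents are illustrative assumptions):

# Collapse-a-stream-into-one-string pattern used by generate() above.
def fake_generate_gen():
    # Each item mirrors the (chunk_text, ...) tuples yielded by generate_gen.
    yield ("Hello", 1)
    yield (", ", 2)
    yield ("world", 3)

gen = list(fake_generate_gen())
response = "".join(map(lambda o: o[0], gen))
print(response)  # Hello, world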
@@ -318,11 +306,9 @@ class ModelContainer:
        generate_window = min(unwrap(kwargs.get("generate_window"), 512), max_tokens)

        # Sampler settings

        gen_settings = ExLlamaV2Sampler.Settings()

        # Warn of unsupported settings if the setting is enabled

        if (unwrap(kwargs.get("mirostat"), False)) and not hasattr(gen_settings, "mirostat"):
            print(" !! Warning: Currently installed ExLlamaV2 does not support Mirostat sampling")
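Note: the warning above (and the similar ones that follow) uses hasattr on the sampler settings object to detect whether the installed exllamav2 build exposes a given field before relying on it. The pattern in isolation; FakeSettings and apply_if_supported are stand-ins, not part of the diff:

# Feature-detection pattern used above: only apply a sampler option when the
# installed settings object actually exposes that attribute.
class FakeSettings:            # stand-in for ExLlamaV2Sampler.Settings
    temperature = 1.0          # deliberately has no "mirostat" attribute

def apply_if_supported(settings, name, value):
    if hasattr(settings, name):
        setattr(settings, name, value)
        return True
    print(f" !! Warning: Currently installed ExLlamaV2 does not support {name}")
    return False

apply_if_supported(FakeSettings(), "mirostat", True)     # prints the warning
apply_if_supported(FakeSettings(), "temperature", 0.8)   # applied silently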
@@ -335,8 +321,7 @@ class ModelContainer:
        if (unwrap(kwargs.get("temperature_last"), False)) and not hasattr(gen_settings, "temperature_last"):
            print(" !! Warning: Currently installed ExLlamaV2 does not support temperature_last")

        #Apply settings
        # Apply settings
        gen_settings.temperature = unwrap(kwargs.get("temperature"), 1.0)
        gen_settings.temperature_last = unwrap(kwargs.get("temperature_last"), False)
        gen_settings.top_k = unwrap(kwargs.get("top_k"), 0)
@@ -363,14 +348,12 @@ class ModelContainer:

        # Ban the EOS token if specified. If not, append to stop conditions as well.

        if ban_eos_token:
            gen_settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])
        else:
            stop_conditions.append(self.tokenizer.eos_token_id)

        # Override sampler settings for temp = 0

        if gen_settings.temperature == 0:
            gen_settings.temperature = 1.0
            gen_settings.top_k = 1
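Note: the temperature == 0 branch above maps a "deterministic" request onto greedy decoding: temperature is reset to a neutral 1.0 and top_k = 1 keeps only the single most likely token (the next hunk shows typical being zeroed as part of the same block). In isolation, with a hypothetical helper name:

# Greedy-decoding override used above: temperature 0 is not meaningful for
# softmax sampling, so it is translated to "always take the argmax token".
def apply_greedy_override(gen_settings):
    if gen_settings.temperature == 0:
        gen_settings.temperature = 1.0   # neutral temperature
        gen_settings.top_k = 1           # keep only the most likely token
        gen_settings.typical = 0         # disable typical sampling (next hunk)
    return gen_settings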
@@ -378,11 +361,9 @@ class ModelContainer:
            gen_settings.typical = 0

        # Stop conditions

        self.generator.set_stop_conditions(stop_conditions)

        # Tokenized context

        ids = self.tokenizer.encode(
            prompt,
            add_bos = unwrap(kwargs.get("add_bos_token"), True),
@@ -399,7 +380,6 @@ class ModelContainer:
        prompt_tokens = ids.shape[-1]

        # Begin

        generated_tokens = 0
        full_response = ""
        start_time = time.time()
@@ -410,11 +390,8 @@ class ModelContainer:
        chunk_tokens = 0

        while True:

            # Ingest prompt

            if chunk_tokens == 0:

                ids = torch.cat((ids, save_tokens), dim = - 1)
                save_tokens = torch.empty((1, 0), dtype = torch.bool)
                overflow = ids.shape[-1] + generate_window - self.config.max_seq_len
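Note: in the loop above, overflow measures how far the current prompt plus the next generate_window would run past max_seq_len. A standalone sketch of the resulting trim; how active_ids is actually derived falls outside this hunk, so the slice below is an assumption:

import torch

# Sliding-window trim sketch matching the overflow computation above.
def trim_context(ids: torch.Tensor, generate_window: int, max_seq_len: int):
    overflow = ids.shape[-1] + generate_window - max_seq_len
    if overflow > 0:
        ids = ids[:, overflow:]   # drop the oldest tokens (assumed behavior)
    return ids

ids = torch.zeros((1, 3000), dtype = torch.long)
print(trim_context(ids, 512, 4096).shape)   # torch.Size([1, 3000]) - fits, untouched
print(trim_context(ids, 512, 2048).shape)   # torch.Size([1, 1536]) - trimmed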
@@ -424,7 +401,6 @@ class ModelContainer:
                self.generator.begin_stream(active_ids, gen_settings, token_healing = token_healing, loras = self.active_loras)

                # Generate

                chunk, eos, tokens = self.generator.stream()

                if token_healing:
@@ -438,7 +414,6 @@ class ModelContainer:
                    chunk_tokens -= 1

                # Yield output

                now = time.time()
                elapsed = now - last_chunk_time