From a3c780ae581107dc6f74ed1ca019e7a63494ade1 Mon Sep 17 00:00:00 2001 From: kingbri <8082010+kingbri1@users.noreply.github.com> Date: Sun, 25 May 2025 22:15:21 -0400 Subject: [PATCH 1/5] API: Core: Remove load/template aliases These added extra complexity and should be removed and replaced with a single parameter. Changes: - /v1/model/load must use model_name and draft_model_name - /v1/model/embedding/load must use embedding_model_name - /v1/template/switch must use prompt_template_name Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com> --- endpoints/core/types/model.py | 22 +++++----------------- endpoints/core/types/template.py | 7 ++----- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py index e01b603..8e50622 100644 --- a/endpoints/core/types/model.py +++ b/endpoints/core/types/model.py @@ -1,6 +1,6 @@ """Contains model card types.""" -from pydantic import AliasChoices, BaseModel, Field, ConfigDict +from pydantic import BaseModel, Field, ConfigDict from time import time from typing import List, Literal, Optional, Union @@ -50,10 +50,7 @@ class DraftModelLoadRequest(BaseModel): """Represents a draft model load request.""" # Required - draft_model_name: str = Field( - alias=AliasChoices("draft_model_name", "name"), - description="Aliases: name", - ) + draft_model_name: str # Config arguments draft_rope_scale: Optional[float] = None @@ -75,10 +72,7 @@ class ModelLoadRequest(BaseModel): model_config = ConfigDict(protected_namespaces=[]) # Required - model_name: str = Field( - alias=AliasChoices("model_name", "name"), - description="Aliases: name", - ) + model_name: str # Config arguments backend: Optional[str] = Field( @@ -118,18 +112,12 @@ class ModelLoadRequest(BaseModel): vision: Optional[bool] = None # Non-config arguments - draft_model: Optional[DraftModelLoadRequest] = Field( - default=None, - alias=AliasChoices("draft_model", "draft"), - ) + draft_model: Optional[DraftModelLoadRequest] = None skip_queue: Optional[bool] = False class EmbeddingModelLoadRequest(BaseModel): - embedding_model_name: str = Field( - alias=AliasChoices("embedding_model_name", "name"), - description="Aliases: name", - ) + embedding_model_name: str # Set default from the config embeddings_device: Optional[str] = Field(config.embeddings.embeddings_device) diff --git a/endpoints/core/types/template.py b/endpoints/core/types/template.py index 010c9db..a82ef48 100644 --- a/endpoints/core/types/template.py +++ b/endpoints/core/types/template.py @@ -1,4 +1,4 @@ -from pydantic import AliasChoices, BaseModel, Field +from pydantic import BaseModel, Field from typing import List @@ -12,7 +12,4 @@ class TemplateList(BaseModel): class TemplateSwitchRequest(BaseModel): """Request to switch a template.""" - prompt_template_name: str = Field( - alias=AliasChoices("prompt_template_name", "name"), - description="Aliases: name", - ) + prompt_template_name: str From 322f9b773ac41e9e996c40c2cc1fdc6373bb9248 Mon Sep 17 00:00:00 2001 From: kingbri <8082010+kingbri1@users.noreply.github.com> Date: Mon, 26 May 2025 20:51:28 -0400 Subject: [PATCH 2/5] Model: Migrate inline config to new format This matches config.yml and all model overrides should go under the "model" block. 
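For reference, a minimal sketch of an inline override file under the new layout (the keys are illustrative ones borrowed from the load options elsewhere in this series; draft overrides keep their own top-level block, mirroring config.yml):

    model:
      max_seq_len: 8192
      rope_alpha: auto
    draft_model:
      draft_rope_scale: 1.0
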
Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com> --- common/model.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/common/model.py b/common/model.py index 44a35fa..1935025 100644 --- a/common/model.py +++ b/common/model.py @@ -76,6 +76,9 @@ async def apply_inline_overrides(model_dir: pathlib.Path, **kwargs): if not override_config_path.exists(): return kwargs + # Initialize overrides dict + overrides = {} + async with aiofiles.open( override_config_path, "r", encoding="utf8" ) as override_config_file: @@ -83,18 +86,25 @@ async def apply_inline_overrides(model_dir: pathlib.Path, **kwargs): # Create a temporary YAML parser yaml = YAML(typ="safe") - override_args = unwrap(yaml.load(contents), {}) + inline_config = unwrap(yaml.load(contents), {}) + + # Check for inline model overrides + model_inline_config = unwrap(inline_config.get("model"), {}) + if model_inline_config: + overrides = {**model_inline_config} + else: + logger.warning( + "Cannot find inline model overrides. " + "Make sure they are nested under a \"model:\" key" + ) # Merge draft overrides beforehand - draft_override_args = unwrap(override_args.get("draft_model"), {}) - if draft_override_args: - kwargs["draft_model"] = { - **draft_override_args, - **unwrap(kwargs.get("draft_model"), {}), - } + draft_inline_config = unwrap(inline_config.get("draft_model"), {}) + if draft_inline_config: + overrides["draft_model"] = {**draft_inline_config} # Merge the override and model kwargs - merged_kwargs = {**override_args, **kwargs} + merged_kwargs = {**overrides, **kwargs} return merged_kwargs From 2096c9bad2d912b67e2cf95eadd6c1f1c8b00716 Mon Sep 17 00:00:00 2001 From: kingbri <8082010+kingbri1@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:12:03 -0400 Subject: [PATCH 3/5] Model: Default max_seq_len to 4096 A common problem in TabbyAPI is that users who want to get up and running with a model always had issues with max_seq_len causing OOMs. This is because model devs set max context values in the millions which requires a lot of VRAM. To idiot-proof first time setup, make the fallback default 4096 so users can run their models. If a user still wants to use the model's max_seq_len, set it to -1. Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com> --- backends/exllamav2/model.py | 10 ++++++---- common/config_models.py | 6 +++--- common/model.py | 9 ++++++++- common/transformers_utils.py | 3 +-- config_sample.yml | 4 ++-- 5 files changed, 20 insertions(+), 12 deletions(-) diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 29caa34..fd23a28 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -235,11 +235,10 @@ class ExllamaV2Container(BaseModelContainer): # Grab the base model's sequence length before overrides for # rope calculations - base_seq_len = self.config.max_seq_len + base_seq_len = hf_model.hf_config.max_position_embeddings # Set the target seq len if present - # Fallback to base_seq_len if not provided - target_seq_len = unwrap(kwargs.get("max_seq_len"), base_seq_len) + target_seq_len = unwrap(kwargs.get("max_seq_len"), 4096) # Set the rope scale self.config.scale_pos_emb = unwrap( @@ -247,6 +246,7 @@ class ExllamaV2Container(BaseModelContainer): ) # Sets rope alpha value. + # Utilize the model's max_position_embeddings as a base value # Automatically calculate if unset or defined as an "auto" literal. 
rope_alpha = unwrap(kwargs.get("rope_alpha"), "auto") if rope_alpha == "auto": @@ -371,7 +371,7 @@ class ExllamaV2Container(BaseModelContainer): ) # Set draft rope alpha. Follows same behavior as model rope alpha. - # Use the base sequence length of the model + # Use the max_position_embeddings of the model draft_rope_alpha = unwrap(draft_args.get("draft_rope_alpha"), "auto") if draft_rope_alpha == "auto": self.draft_config.scale_alpha_value = calculate_rope_alpha( @@ -399,6 +399,8 @@ class ExllamaV2Container(BaseModelContainer): self.draft_config.max_input_len = chunk_size self.draft_config.max_attention_size = chunk_size**2 + print(self.config.max_seq_len) + # Return the created instance return self diff --git a/common/config_models.py b/common/config_models.py index 0958a8e..46a7b5e 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -175,10 +175,10 @@ class ModelConfig(BaseConfigModel): max_seq_len: Optional[int] = Field( None, description=( - "Max sequence length (default: Empty).\n" - "Fetched from the model's base sequence length in config.json by default." + "Max sequence length (default: 4096).\n" + "Set to -1 to fetch from the model's config.json" ), - ge=0, + ge=-1, ) tensor_parallel: Optional[bool] = Field( False, diff --git a/common/model.py b/common/model.py index 1935025..6534951 100644 --- a/common/model.py +++ b/common/model.py @@ -95,7 +95,7 @@ async def apply_inline_overrides(model_dir: pathlib.Path, **kwargs): else: logger.warning( "Cannot find inline model overrides. " - "Make sure they are nested under a \"model:\" key" + 'Make sure they are nested under a "model:" key' ) # Merge draft overrides beforehand @@ -148,6 +148,13 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs): # Fetch the extra HF configuration options hf_model = await HFModel.from_directory(model_path) + # Override the max sequence length based on user + max_seq_len = kwargs.get("max_seq_len") + if max_seq_len == -1: + kwargs["max_seq_len"] = hf_model.hf_config.max_position_embeddings + elif max_seq_len is None: + kwargs["max_seq_len"] = 4096 + # Create a new container and check if the right dependencies are installed backend = unwrap(kwargs.get("backend"), detect_backend(hf_model)) container_class = _BACKEND_REGISTRY.get(backend) diff --git a/common/transformers_utils.py b/common/transformers_utils.py index 6cac3b8..a7b0f0c 100644 --- a/common/transformers_utils.py +++ b/common/transformers_utils.py @@ -39,12 +39,11 @@ class GenerationConfig(BaseModel): class HuggingFaceConfig(BaseModel): """ - DEPRECATED: Currently a stub and doesn't do anything. - An abridged version of HuggingFace's model config. Will be expanded as needed. """ + max_position_embeddings: int = 4096 eos_token_id: Optional[Union[int, List[int]]] = None quantization_config: Optional[Dict] = None diff --git a/config_sample.yml b/config_sample.yml index ffe2605..1ee82a9 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -78,8 +78,8 @@ model: # Options: exllamav2, exllamav3 backend: - # Max sequence length (default: Empty). - # Fetched from the model's base sequence length in config.json by default. + # Max sequence length (default: 4096). + # Set to -1 to fetch from the model's config.json max_seq_len: # Load model with tensor parallelism. 
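As a quick usage sketch (a minimal example, not part of the diff above), the two modes in config.yml look like:

    model:
      # Cap context at the safe default (or any explicit value)
      max_seq_len: 4096
      # Or set -1 to use the full length from the model's config.json
      # max_seq_len: -1
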
From 2913ce29fc32e25fe93af6c1d06d3d69705fe1be Mon Sep 17 00:00:00 2001 From: kingbri <8082010+kingbri1@users.noreply.github.com> Date: Tue, 17 Jun 2025 22:54:51 -0400 Subject: [PATCH 4/5] API: Add timings to usage stats It's useful for the client to know what the T/s and total time for generation are per-request. Works with both completions and chat completions. Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com> --- backends/exllamav2/model.py | 52 +++++++++++++++++--------- backends/exllamav3/model.py | 50 +++++++++++++++++-------- common/gen_logging.py | 31 +++++---------- endpoints/OAI/types/common.py | 7 +++- endpoints/OAI/utils/chat_completion.py | 23 +++++++++--- endpoints/OAI/utils/completion.py | 10 ++++- 6 files changed, 110 insertions(+), 63 deletions(-) diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 091043a..18a06c9 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -913,7 +913,7 @@ class ExllamaV2Container(BaseModelContainer): joined_generation = { "text": "", "prompt_tokens": 0, - "generation_tokens": 0, + "gen_tokens": 0, "tool_calls": None, "offset": [], "token_probs": {}, @@ -923,11 +923,8 @@ class ExllamaV2Container(BaseModelContainer): if generations: # Get finish_reason first and then shift where -1 points to if "finish_reason" in generations[-1]: - finish_reason_gen = generations.pop() - joined_generation["finish_reason"] = finish_reason_gen.get( - "finish_reason" - ) - joined_generation["stop_str"] = finish_reason_gen.get("stop_str") + finish_chunk = generations.pop() + joined_generation = {**joined_generation, **finish_chunk} else: joined_generation["finish_reason"] = "stop" @@ -1189,9 +1186,35 @@ class ExllamaV2Container(BaseModelContainer): elif eos_reason == "stop_string": stop_str = result.get("eos_triggering_string") + # Prompt + prompt_tokens = result.get("prompt_tokens") + cached_tokens = round(result.get("cached_tokens"), 2) + prompt_time = round(result.get("time_prefill"), 2) + prompt_ts = ( + "Indeterminate" + if prompt_time == 0 + else round((prompt_tokens - cached_tokens) / prompt_time, 2) + ) + + # Generated + gen_tokens = result.get("new_tokens") + gen_time = result.get("time_generate") + gen_ts = "Indeterminate" if gen_time == 0 else round(gen_tokens / gen_time, 2) + + # Queue + Total + queue_time = result.get("time_enqueued") + total_time = round(queue_time + prompt_time + gen_time, 2) + finish_chunk = { - "prompt_tokens": generation.get("prompt_tokens"), - "generated_tokens": generation.get("generated_tokens"), + "prompt_tokens": prompt_tokens, + "prompt_time": round(prompt_time, 2), + "prompt_tokens_per_sec": prompt_ts, + "gen_tokens": gen_tokens, + "gen_time": round(gen_time, 2), + "gen_tokens_per_sec": gen_ts, + "total_time": total_time, + "queue_time": round(queue_time, 2), + "cached_tokens": cached_tokens, "finish_reason": finish_reason, "stop_str": stop_str, } @@ -1413,12 +1436,12 @@ class ExllamaV2Container(BaseModelContainer): if result.get("eos"): log_response(request_id, full_response) - generation = self.handle_finish_chunk(result, generation) + finish_chunk = self.handle_finish_chunk(result, generation) # Save the final result for metrics logging - metrics_result = result + metrics_result = finish_chunk - yield generation + yield finish_chunk break except asyncio.CancelledError: await job.cancel() @@ -1451,12 +1474,7 @@ class ExllamaV2Container(BaseModelContainer): if metrics_result: log_metrics( request_id, - metrics_result.get("time_enqueued"), - 
metrics_result.get("prompt_tokens"), - metrics_result.get("cached_tokens"), - metrics_result.get("time_prefill"), - metrics_result.get("new_tokens"), - metrics_result.get("time_generate"), + metrics_result, context_len, max_seq_len, ) diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index ee2dfaa..2cfd196 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -649,11 +649,8 @@ class ExllamaV3Container(BaseModelContainer): if generations: # Get finish_reason first and then shift where -1 points to if "finish_reason" in generations[-1]: - finish_reason_gen = generations.pop() - joined_generation["finish_reason"] = finish_reason_gen.get( - "finish_reason" - ) - joined_generation["stop_str"] = finish_reason_gen.get("stop_str") + finish_chunk = generations.pop() + joined_generation = {**joined_generation, **finish_chunk} else: joined_generation["finish_reason"] = "stop" @@ -743,9 +740,35 @@ class ExllamaV3Container(BaseModelContainer): elif eos_reason == "stop_string": stop_str = result.get("eos_triggering_string") + # Prompt + prompt_tokens = result.get("prompt_tokens") + cached_tokens = round(result.get("cached_tokens"), 2) + prompt_time = round(result.get("time_prefill"), 2) + prompt_ts = ( + "Indeterminate" + if prompt_time == 0 + else round((prompt_tokens - cached_tokens) / prompt_time, 2) + ) + + # Generated + gen_tokens = result.get("new_tokens") + gen_time = result.get("time_generate") + gen_ts = "Indeterminate" if gen_time == 0 else round(gen_tokens / gen_time, 2) + + # Queue + Total + queue_time = result.get("time_enqueued") + total_time = round(queue_time + prompt_time + gen_time, 2) + finish_chunk = { - "prompt_tokens": generation.get("prompt_tokens"), - "generated_tokens": generation.get("generated_tokens"), + "prompt_tokens": prompt_tokens, + "prompt_time": round(prompt_time, 2), + "prompt_tokens_per_sec": prompt_ts, + "gen_tokens": gen_tokens, + "gen_time": round(gen_time, 2), + "gen_tokens_per_sec": gen_ts, + "total_time": total_time, + "queue_time": round(queue_time, 2), + "cached_tokens": cached_tokens, "finish_reason": finish_reason, "stop_str": stop_str, } @@ -921,12 +944,12 @@ class ExllamaV3Container(BaseModelContainer): yield generation if result.get("eos"): - generation = self.handle_finish_chunk(result, generation) + finish_chunk = self.handle_finish_chunk(result, generation) # Save the final result for metrics logging - metrics_result = result + metrics_result = finish_chunk - yield generation + yield finish_chunk break # Assign the active job to the request ID self.active_job_ids[request_id] = job @@ -962,12 +985,7 @@ class ExllamaV3Container(BaseModelContainer): if metrics_result: log_metrics( request_id, - metrics_result.get("time_enqueued"), - metrics_result.get("prompt_tokens"), - metrics_result.get("cached_tokens"), - metrics_result.get("time_prefill"), - metrics_result.get("new_tokens"), - metrics_result.get("time_generate"), + metrics_result, context_len, self.max_seq_len, ) diff --git a/common/gen_logging.py b/common/gen_logging.py index 490d257..fcd3c01 100644 --- a/common/gen_logging.py +++ b/common/gen_logging.py @@ -54,40 +54,29 @@ def log_response(request_id: str, response: str): def log_metrics( request_id: str, - queue_time: float, - prompt_tokens: int, - cached_tokens: int, - prompt_time: float, - generated_tokens: int, - generate_time: float, + metrics: dict, context_len: Optional[int], max_seq_len: int, ): initial_response = ( - f"Metrics (ID: {request_id}): {generated_tokens} tokens generated in " - 
f"{round(queue_time + prompt_time + generate_time, 2)} seconds" + f"Metrics (ID: {request_id}): {metrics.get('gen_tokens')} " + f"tokens generated in {metrics.get('total_time')} seconds" ) itemization = [] extra_parts = [] - itemization.append(f"Queue: {round(queue_time, 2)} s") + itemization.append(f"Queue: {metrics.get('queue_time')} s") + + cached_tokens = metrics.get("cached_tokens") + prompt_tokens = metrics.get("prompt_tokens") - prompt_ts = ( - "Indeterminate" - if prompt_time == 0 - else round((prompt_tokens - cached_tokens) / prompt_time, 2) - ) itemization.append( f"Process: {cached_tokens} cached tokens and " - f"{prompt_tokens - cached_tokens} new tokens at {prompt_ts} T/s" + f"{prompt_tokens - cached_tokens} new tokens at " + f"{metrics.get('prompt_tokens_per_sec')} T/s" ) - generate_ts = ( - "Indeterminate" - if generate_time == 0 - else round(generated_tokens / generate_time, 2) - ) - itemization.append(f"Generate: {generate_ts} T/s") + itemization.append(f"Generate: {metrics.get('gen_tokens_per_sec')} T/s") # Add context (original token count) if context_len: diff --git a/endpoints/OAI/types/common.py b/endpoints/OAI/types/common.py index 9c90188..16ef2ed 100644 --- a/endpoints/OAI/types/common.py +++ b/endpoints/OAI/types/common.py @@ -1,7 +1,7 @@ """Common types for OAI.""" from pydantic import BaseModel, Field -from typing import Optional +from typing import Optional, Union from common.sampling import BaseSamplerRequest, get_default_sampler_value @@ -10,8 +10,13 @@ class UsageStats(BaseModel): """Represents usage stats.""" prompt_tokens: int + prompt_time: Optional[float] = None + prompt_tokens_per_sec: Optional[Union[float, str]] = None completion_tokens: int + completion_time: Optional[float] = None + completion_tokens_per_sec: Optional[Union[float, str]] = None total_tokens: int + total_time: Optional[float] = None class CompletionResponseFormat(BaseModel): diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index 7eb5bfe..f9a556f 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -38,9 +38,6 @@ def _create_response( ): """Create a chat completion response from the provided text.""" - prompt_tokens = unwrap(generations[-1].get("prompt_tokens"), 0) - completion_tokens = unwrap(generations[-1].get("generated_tokens"), 0) - choices = [] for index, generation in enumerate(generations): message = ChatCompletionMessage( @@ -91,14 +88,23 @@ def _create_response( choices.append(choice) + final_generation = generations[-1] + prompt_tokens = unwrap(final_generation.get("prompt_tokens"), 0) + completion_tokens = unwrap(final_generation.get("gen_tokens"), 0) + response = ChatCompletionResponse( - id=f"chatcmpl-{request_id}", + id=f"cmpl-{request_id}", choices=choices, - model=unwrap(model_name, ""), + model=model_name, usage=UsageStats( prompt_tokens=prompt_tokens, + prompt_time=final_generation.get("prompt_time"), + prompt_tokens_per_sec=final_generation.get("prompt_tokens_per_sec"), completion_tokens=completion_tokens, + completion_time=final_generation.get("gen_time"), + completion_tokens_per_sec=final_generation.get("gen_tokens_per_sec"), total_tokens=prompt_tokens + completion_tokens, + total_time=final_generation.get("total_time"), ), ) @@ -119,12 +125,17 @@ def _create_stream_chunk( if is_usage_chunk: prompt_tokens = unwrap(generation.get("prompt_tokens"), 0) - completion_tokens = unwrap(generation.get("generated_tokens"), 0) + completion_tokens = unwrap(generation.get("gen_tokens"), 
0) usage_stats = UsageStats( prompt_tokens=prompt_tokens, + prompt_time=generation.get("prompt_time"), + prompt_tokens_per_sec=generation.get("prompt_tokens_per_sec"), completion_tokens=completion_tokens, + completion_time=generation.get("gen_time"), + completion_tokens_per_sec=generation.get("gen_tokens_per_sec"), total_tokens=prompt_tokens + completion_tokens, + total_time=generation.get("total_time"), ) elif "finish_reason" in generation: # Get the finish reason from the generation diff --git a/endpoints/OAI/utils/completion.py b/endpoints/OAI/utils/completion.py index f6c31cb..ca51c9c 100644 --- a/endpoints/OAI/utils/completion.py +++ b/endpoints/OAI/utils/completion.py @@ -73,8 +73,9 @@ def _create_response( choices.append(choice) - prompt_tokens = unwrap(generations[-1].get("prompt_tokens"), 0) - completion_tokens = unwrap(generations[-1].get("generated_tokens"), 0) + final_generation = generations[-1] + prompt_tokens = unwrap(final_generation.get("prompt_tokens"), 0) + completion_tokens = unwrap(final_generation.get("gen_tokens"), 0) response = CompletionResponse( id=f"cmpl-{request_id}", @@ -82,8 +83,13 @@ def _create_response( model=model_name, usage=UsageStats( prompt_tokens=prompt_tokens, + prompt_time=final_generation.get("prompt_time"), + prompt_tokens_per_sec=final_generation.get("prompt_tokens_per_sec"), completion_tokens=completion_tokens, + completion_time=final_generation.get("gen_time"), + completion_tokens_per_sec=final_generation.get("gen_tokens_per_sec"), total_tokens=prompt_tokens + completion_tokens, + total_time=final_generation.get("total_time"), ), ) From a02d39de31b731b63e9f903998d59ae8c50bd1de Mon Sep 17 00:00:00 2001 From: kingbri <8082010+kingbri1@users.noreply.github.com> Date: Tue, 17 Jun 2025 23:09:07 -0400 Subject: [PATCH 5/5] Model: Remove rogue print Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com> --- backends/exllamav2/model.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 18a06c9..f5b8e2f 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -399,8 +399,6 @@ class ExllamaV2Container(BaseModelContainer): self.draft_config.max_input_len = chunk_size self.draft_config.max_attention_size = chunk_size**2 - print(self.config.max_seq_len) - # Return the created instance return self
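
For reference, with the timing patch applied the usage object in completion and chat completion responses carries the new fields from UsageStats; a sketch with illustrative numbers (shown YAML-style for brevity, values made up):

    usage:
      prompt_tokens: 512
      prompt_time: 0.35
      prompt_tokens_per_sec: 1462.86
      completion_tokens: 128
      completion_time: 3.2
      completion_tokens_per_sec: 40.0
      total_tokens: 640
      total_time: 3.58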