API: Add timings to usage stats
It's useful for the client to know the tokens per second (T/s) and the total generation time for each request. This works with both completions and chat completions.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
This commit is contained in:
parent
5d94d4d022
commit
2913ce29fc
6 changed files with 110 additions and 63 deletions
|
|
@@ -54,40 +54,29 @@ def log_response(request_id: str, response: str):
|
|||
|
||||
def log_metrics(
|
||||
request_id: str,
|
||||
queue_time: float,
|
||||
prompt_tokens: int,
|
||||
cached_tokens: int,
|
||||
prompt_time: float,
|
||||
generated_tokens: int,
|
||||
generate_time: float,
|
||||
metrics: dict,
|
||||
context_len: Optional[int],
|
||||
max_seq_len: int,
|
||||
):
|
||||
initial_response = (
|
||||
f"Metrics (ID: {request_id}): {generated_tokens} tokens generated in "
|
||||
f"{round(queue_time + prompt_time + generate_time, 2)} seconds"
|
||||
f"Metrics (ID: {request_id}): {metrics.get('gen_tokens')} "
|
||||
f"tokens generated in {metrics.get('total_time')} seconds"
|
||||
)
|
||||
itemization = []
|
||||
extra_parts = []
|
||||
|
||||
itemization.append(f"Queue: {round(queue_time, 2)} s")
|
||||
itemization.append(f"Queue: {metrics.get('queue_time')} s")
|
||||
|
||||
cached_tokens = metrics.get("cached_tokens")
|
||||
prompt_tokens = metrics.get("prompt_tokens")
|
||||
|
||||
prompt_ts = (
|
||||
"Indeterminate"
|
||||
if prompt_time == 0
|
||||
else round((prompt_tokens - cached_tokens) / prompt_time, 2)
|
||||
)
|
||||
itemization.append(
|
||||
f"Process: {cached_tokens} cached tokens and "
|
||||
f"{prompt_tokens - cached_tokens} new tokens at {prompt_ts} T/s"
|
||||
f"{prompt_tokens - cached_tokens} new tokens at "
|
||||
f"{metrics.get('prompt_tokens_per_sec')} T/s"
|
||||
)
|
||||
|
||||
generate_ts = (
|
||||
"Indeterminate"
|
||||
if generate_time == 0
|
||||
else round(generated_tokens / generate_time, 2)
|
||||
)
|
||||
itemization.append(f"Generate: {generate_ts} T/s")
|
||||
itemization.append(f"Generate: {metrics.get('gen_tokens_per_sec')} T/s")
|
||||
|
||||
# Add context (original token count)
|
||||
if context_len:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue