Merge pull request #112 from DocShotgun/main

Separate new prompt tokens from those reused from cache in metric logging
Author: Brian Dashore, 2024-05-27 18:04:43 -04:00 (committed by GitHub)
commit 516b52b341
2 changed files with 9 additions and 2 deletions
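The substance of the change: prefill throughput is now computed only over tokens that actually had to be processed, i.e. prompt_tokens minus cached_tokens, and the log line reports the cached and new token counts separately. Below is a minimal, self-contained sketch of that calculation; the helper name prompt_throughput and the example numbers are illustrative only, not part of the repository, and only the parameter names come from the diff that follows.

# Sketch of the throughput math introduced by this PR; not the repo's actual code.
def prompt_throughput(prompt_tokens: int, cached_tokens: int, prompt_time: float) -> str:
    """Prefill tokens/sec, counting only tokens not reused from the cache."""
    if prompt_time == 0:
        # No measurable prefill time (e.g. the entire prompt was served from cache).
        return "Indeterminate"
    new_tokens = prompt_tokens - cached_tokens
    return str(round(new_tokens / prompt_time, 2))

# Example: a 1000-token prompt where 800 tokens were reused from the cache
# and prefill took 0.5 s -> only the 200 new tokens count toward the rate: 400.0 T/s
print(prompt_throughput(prompt_tokens=1000, cached_tokens=800, prompt_time=0.5))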

@@ -1144,6 +1144,7 @@ class ExllamaV2Container:
         log_metrics(
             result.get("time_enqueued"),
             result.get("prompt_tokens"),
+            result.get("cached_tokens"),
             result.get("time_prefill"),
             result.get("new_tokens"),
             result.get("time_generate"),

@@ -72,6 +72,7 @@ def log_response(response: str):
 def log_metrics(
     queue_time: float,
     prompt_tokens: int,
+    cached_tokens: int,
     prompt_time: float,
     generated_tokens: int,
     generate_time: float,
@@ -88,9 +89,14 @@ def log_metrics(
     itemization.append(f"Queue: {round(queue_time, 2)} s")
     prompt_ts = (
-        "Indeterminate" if prompt_time == 0 else round(prompt_tokens / prompt_time, 2)
+        "Indeterminate"
+        if prompt_time == 0
+        else round((prompt_tokens - cached_tokens) / prompt_time, 2)
     )
+    itemization.append(
+        f"Process: {cached_tokens} cached tokens and "
+        f"{prompt_tokens - cached_tokens} new tokens at {prompt_ts} T/s"
+    )
-    itemization.append(f"Process: {prompt_ts} T/s")
     generate_ts = (
         "Indeterminate"