Model: Add tokens/second output

Signed-off-by: kingbri <bdashore3@proton.me>
kingbri committed 2023-11-17 01:16:20 -05:00
parent 021981fce0
commit 9dfa580b1e


@@ -1,5 +1,6 @@
 import gc, time, pathlib
 import torch
+from datetime import datetime
 from exllamav2 import(
     ExLlamaV2,
     ExLlamaV2Config,
@@ -305,7 +306,8 @@ class ModelContainer:
 
         generated_tokens = 0
         full_response = ""
-        last_chunk_time = time.time()
+        start_time = time.time()
+        last_chunk_time = start_time
 
         save_tokens = torch.empty((1, 0), dtype = torch.bool)
         chunk_buffer = ""
@@ -350,4 +352,7 @@ class ModelContainer:
                 chunk_buffer = ""
                 last_chunk_time = now
 
-            if eos or generated_tokens == max_tokens: break
+            if eos or generated_tokens == max_tokens: break
+
+        elapsed_time = last_chunk_time - start_time
+        print(f"Response generated in {round(elapsed_time, 2)} seconds ({round(generated_tokens / elapsed_time, 2)} T/s)")
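The measurement above amounts to: record start_time before the generation loop begins, update last_chunk_time each time a chunk is flushed, then divide the token count by the elapsed span. Below is a minimal self-contained sketch of that pattern outside the ModelContainer class; stream_with_throughput and dummy_generator are hypothetical names, not part of this commit, and the real code streams tokens from ExLlamaV2 rather than a toy generator.

import time

def stream_with_throughput(generate_tokens, max_tokens):
    # Record the wall-clock start before any tokens are produced,
    # mirroring start_time = time.time() in the commit.
    start_time = time.time()
    last_chunk_time = start_time
    generated_tokens = 0

    for token in generate_tokens(max_tokens):
        generated_tokens += 1
        # Track when the most recent chunk arrived, as last_chunk_time does.
        last_chunk_time = time.time()

    # Elapsed time runs to the last chunk, not to "now", so any teardown
    # work after streaming is excluded from the throughput figure.
    elapsed_time = last_chunk_time - start_time
    # Hypothetical guard, not present in the commit: dividing unconditionally
    # raises ZeroDivisionError when no chunk ever updates last_chunk_time.
    if elapsed_time > 0:
        print(f"Response generated in {round(elapsed_time, 2)} seconds "
              f"({round(generated_tokens / elapsed_time, 2)} T/s)")

# Example usage with a stand-in generator that simulates decode latency.
def dummy_generator(max_tokens):
    for _ in range(max_tokens):
        time.sleep(0.01)
        yield "tok"

stream_with_throughput(dummy_generator, 50)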