From ead503c75bcab61c26f5aecc444e738c4e6218fc Mon Sep 17 00:00:00 2001
From: Mehran Ziadloo
Date: Mon, 27 Nov 2023 20:05:05 -0800
Subject: [PATCH 1/2] Adding token usage support

---
 OAI/types/chat_completion.py |  2 --
 OAI/types/common.py          |  2 +-
 OAI/types/completion.py      |  2 --
 OAI/utils.py                 | 20 +++++++++++---------
 main.py                      | 15 ++++++++++++---
 model.py                     | 16 ++++++++++------
 6 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/OAI/types/chat_completion.py b/OAI/types/chat_completion.py
index 233846d..62353d9 100644
--- a/OAI/types/chat_completion.py
+++ b/OAI/types/chat_completion.py
@@ -32,8 +32,6 @@ class ChatCompletionResponse(BaseModel):
     created: int = Field(default_factory=lambda: int(time()))
     model: str
     object: str = "chat.completion"
-
-    # TODO: Add usage stats
     usage: Optional[UsageStats] = None
 
 class ChatCompletionStreamChunk(BaseModel):
diff --git a/OAI/types/common.py b/OAI/types/common.py
index fcdeede..879df0d 100644
--- a/OAI/types/common.py
+++ b/OAI/types/common.py
@@ -8,8 +8,8 @@ class LogProbs(BaseModel):
     top_logprobs: List[Dict[str, float]] = Field(default_factory=list)
 
 class UsageStats(BaseModel):
-    completion_tokens: int
     prompt_tokens: int
+    completion_tokens: int
     total_tokens: int
 
 class CommonCompletionRequest(BaseModel):
diff --git a/OAI/types/completion.py b/OAI/types/completion.py
index da90aed..55f54a3 100644
--- a/OAI/types/completion.py
+++ b/OAI/types/completion.py
@@ -22,6 +22,4 @@ class CompletionResponse(BaseModel):
     created: int = Field(default_factory=lambda: int(time()))
     model: str
     object: str = "text_completion"
-
-    # TODO: Add usage stats
     usage: Optional[UsageStats] = None
diff --git a/OAI/utils.py b/OAI/utils.py
index e97f1b7..3cfdfef 100644
--- a/OAI/utils.py
+++ b/OAI/utils.py
@@ -1,5 +1,5 @@
 import os, pathlib
-from OAI.types.completion import CompletionResponse, CompletionRespChoice
+from OAI.types.completion import CompletionResponse, CompletionRespChoice, UsageStats
 from OAI.types.chat_completion import (
     ChatCompletionMessage,
     ChatCompletionRespChoice,
@@ -20,9 +20,7 @@ try:
 except ImportError:
     _fastchat_available = False
 
-def create_completion_response(text: str, model_name: Optional[str]):
-    # TODO: Add method to get token amounts in model for UsageStats
-
+def create_completion_response(text: str, prompt_tokens: int, completion_tokens: int, model_name: Optional[str]):
     choice = CompletionRespChoice(
         finish_reason = "Generated",
         text = text
@@ -30,14 +28,15 @@
     response = CompletionResponse(
         choices = [choice],
-        model = model_name or ""
+        model = model_name or "",
+        usage = UsageStats(prompt_tokens = prompt_tokens,
+                           completion_tokens = completion_tokens,
+                           total_tokens = prompt_tokens + completion_tokens)
     )
 
     return response
 
-def create_chat_completion_response(text: str, model_name: Optional[str]):
-    # TODO: Add method to get token amounts in model for UsageStats
-
+def create_chat_completion_response(text: str, prompt_tokens: int, completion_tokens: int, model_name: Optional[str]):
     message = ChatCompletionMessage(
         role = "assistant",
         content = text
     )
@@ -50,7 +49,10 @@
     response = ChatCompletionResponse(
         choices = [choice],
-        model = model_name or ""
+        model = model_name or "",
+        usage = UsageStats(prompt_tokens = prompt_tokens,
+                           completion_tokens = completion_tokens,
+                           total_tokens = prompt_tokens + completion_tokens)
     )
 
     return response
diff --git a/main.py b/main.py
index 1948373..88918d2 100644
--- a/main.py
+++ b/main.py
@@ -179,14 +179,20 @@ async def generate_completion(request: Request, data: CompletionRequest):
                 if await request.is_disconnected():
                     break
 
-                response = create_completion_response(part, model_path.name)
+                response = create_completion_response(part,
+                                model_container.prompt_token_size,
+                                model_container.completion_token_size,
+                                model_path.name)
 
                 yield response.json(ensure_ascii=False)
 
         return EventSourceResponse(generator())
     else:
         response_text = model_container.generate(data.prompt, **data.to_gen_params())
-        response = create_completion_response(response_text, model_path.name)
+        response = create_completion_response(response_text,
+                        model_container.prompt_token_size,
+                        model_container.completion_token_size,
+                        model_path.name)
 
         return response
 
@@ -219,7 +225,10 @@ async def generate_chat_completion(request: Request, data: ChatCompletionRequest
 
         return EventSourceResponse(generator())
     else:
         response_text = model_container.generate(prompt, **data.to_gen_params())
-        response = create_chat_completion_response(response_text, model_path.name)
+        response = create_chat_completion_response(response_text,
+                        model_container.prompt_token_size,
+                        model_container.completion_token_size,
+                        model_path.name)
 
         return response
diff --git a/model.py b/model.py
index 1ebbdf1..f893860 100644
--- a/model.py
+++ b/model.py
@@ -32,6 +32,8 @@ class ModelContainer:
     draft_enabled: bool = False
     gpu_split_auto: bool = True
     gpu_split: list or None = None
+    prompt_token_size: int = 0
+    completion_token_size: int = 0
 
     def __init__(self, model_directory: pathlib.Path, quiet = False, **kwargs):
         """
@@ -333,9 +335,11 @@ class ModelContainer:
             encode_special_tokens = True
         )
 
+        self.prompt_token_size = ids.shape[-1]
+
         # Begin
 
-        generated_tokens = 0
+        self.completion_token_size = 0
         full_response = ""
         start_time = time.time()
         last_chunk_time = start_time
@@ -369,7 +373,7 @@ class ModelContainer:
             save_tokens = torch.cat((save_tokens, tokens), dim=-1)
             chunk_buffer += chunk
 
-            generated_tokens += 1
+            self.completion_token_size += 1
             chunk_tokens -= 1
 
             # Yield output
@@ -377,21 +381,21 @@ class ModelContainer:
             now = time.time()
             elapsed = now - last_chunk_time
 
-            if chunk_buffer != "" and (elapsed > stream_interval or eos or generated_tokens == max_tokens):
+            if chunk_buffer != "" and (elapsed > stream_interval or eos or self.completion_token_size == max_tokens):
                 yield chunk_buffer
                 full_response += chunk_buffer
                 chunk_buffer = ""
                 last_chunk_time = now
 
-            if eos or generated_tokens == max_tokens: break
+            if eos or self.completion_token_size == max_tokens: break
 
         elapsed_time = last_chunk_time - start_time
 
-        initial_response = f"Response: {round(generated_tokens, 2)} tokens generated in {round(elapsed_time, 2)} seconds"
+        initial_response = f"Response: {round(self.completion_token_size)} tokens generated in {round(elapsed_time, 2)} seconds"
         extra_responses = []
 
         # Add tokens per second
-        extra_responses.append(f"{'Indeterminate' if elapsed_time == 0 else round(generated_tokens / elapsed_time, 2)} T/s")
+        extra_responses.append(f"{'Indeterminate' if elapsed_time == 0 else round(self.completion_token_size / elapsed_time, 2)} T/s")
 
         # Add context (original token count)
         if ids is not None:

From b0c42d0f05836c721be653759607ce30145aa184 Mon Sep 17 00:00:00 2001
From: Mehran Ziadloo
Date: Mon, 27 Nov 2023 20:56:56 -0800
Subject: [PATCH 2/2] Leveraging local variables

---
 main.py  | 20 ++++++++++----------
 model.py | 24 +++++++++++-------------
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/main.py b/main.py
index 88918d2..832da4d 100644
--- a/main.py
+++ b/main.py
@@ -175,23 +175,23 @@ async def generate_completion(request: Request, data: CompletionRequest):
     if data.stream:
         async def generator():
             new_generation = model_container.generate_gen(data.prompt, **data.to_gen_params())
-            for part in new_generation:
+            for (part, prompt_tokens, completion_tokens) in new_generation:
                 if await request.is_disconnected():
                     break
 
                 response = create_completion_response(part,
-                                model_container.prompt_token_size,
-                                model_container.completion_token_size,
+                                prompt_tokens,
+                                completion_tokens,
                                 model_path.name)
 
                 yield response.json(ensure_ascii=False)
 
         return EventSourceResponse(generator())
     else:
-        response_text = model_container.generate(data.prompt, **data.to_gen_params())
+        response_text, prompt_tokens, completion_tokens = model_container.generate(data.prompt, **data.to_gen_params())
         response = create_completion_response(response_text,
-                        model_container.prompt_token_size,
-                        model_container.completion_token_size,
+                        prompt_tokens,
+                        completion_tokens,
                         model_path.name)
 
         return response
@@ -209,7 +209,7 @@ async def generate_chat_completion(request: Request, data: ChatCompletionRequest
     if data.stream:
         const_id = f"chatcmpl-{uuid4().hex}"
         async def generator():
-            new_generation = model_container.generate_gen(prompt, **data.to_gen_params())
+            new_generation, prompt_tokens, completion_tokens = model_container.generate_gen(prompt, **data.to_gen_params())
             for part in new_generation:
                 if await request.is_disconnected():
                     break
@@ -224,10 +224,10 @@ async def generate_chat_completion(request: Request, data: ChatCompletionRequest
 
         return EventSourceResponse(generator())
     else:
-        response_text = model_container.generate(prompt, **data.to_gen_params())
+        response_text, prompt_tokens, completion_tokens = model_container.generate(prompt, **data.to_gen_params())
         response = create_chat_completion_response(response_text,
-                        model_container.prompt_token_size,
-                        model_container.completion_token_size,
+                        prompt_tokens,
+                        completion_tokens,
                         model_path.name)
 
         return response
diff --git a/model.py b/model.py
index f893860..4fbde86 100644
--- a/model.py
+++ b/model.py
@@ -32,8 +32,6 @@ class ModelContainer:
     draft_enabled: bool = False
     gpu_split_auto: bool = True
     gpu_split: list or None = None
-    prompt_token_size: int = 0
-    completion_token_size: int = 0
 
     def __init__(self, model_directory: pathlib.Path, quiet = False, **kwargs):
         """
@@ -228,9 +226,9 @@ class ModelContainer:
 
     def generate(self, prompt: str, **kwargs):
-        gen = self.generate_gen(prompt, **kwargs)
-        reponse = "".join(gen)
-        return reponse
+        gen = list(self.generate_gen(prompt, **kwargs))
+        reponse = "".join(map(lambda o: o[0], gen))
+        return reponse, gen[-1][1], gen[-1][2]
 
     def generate_gen(self, prompt: str, **kwargs):
         """
@@ -335,11 +333,11 @@ class ModelContainer:
             encode_special_tokens = True
         )
 
-        self.prompt_token_size = ids.shape[-1]
+        prompt_tokens = ids.shape[-1]
 
         # Begin
 
-        self.completion_token_size = 0
+        generated_tokens = 0
         full_response = ""
         start_time = time.time()
         last_chunk_time = start_time
@@ -373,7 +371,7 @@ class ModelContainer:
             save_tokens = torch.cat((save_tokens, tokens), dim=-1)
             chunk_buffer += chunk
 
-            self.completion_token_size += 1
+            generated_tokens += 1
             chunk_tokens -= 1
 
             # Yield output
@@ -381,21 +379,21 @@ class ModelContainer:
             now = time.time()
            elapsed = now - last_chunk_time
 
-            if chunk_buffer != "" and (elapsed > stream_interval or eos or self.completion_token_size == max_tokens):
-                yield chunk_buffer
+            if chunk_buffer != "" and (elapsed > stream_interval or eos or generated_tokens == max_tokens):
+                yield chunk_buffer, prompt_tokens, generated_tokens
                 full_response += chunk_buffer
                 chunk_buffer = ""
                 last_chunk_time = now
 
-            if eos or self.completion_token_size == max_tokens: break
+            if eos or generated_tokens == max_tokens: break
 
         elapsed_time = last_chunk_time - start_time
 
-        initial_response = f"Response: {round(self.completion_token_size)} tokens generated in {round(elapsed_time, 2)} seconds"
+        initial_response = f"Response: {round(generated_tokens)} tokens generated in {round(elapsed_time, 2)} seconds"
         extra_responses = []
 
         # Add tokens per second
-        extra_responses.append(f"{'Indeterminate' if elapsed_time == 0 else round(self.completion_token_size / elapsed_time, 2)} T/s")
+        extra_responses.append(f"{'Indeterminate' if elapsed_time == 0 else round(generated_tokens / elapsed_time, 2)} T/s")
 
         # Add context (original token count)
         if ids is not None:
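
Below is a minimal, self-contained sketch (not part of the patches above) of how the reworked interfaces fit together after PATCH 2/2: generate_gen yields (chunk, prompt_tokens, generated_tokens) tuples, generate joins the chunks and returns the final counts, and the caller builds a UsageStats from them. UsageStats mirrors OAI/types/common.py; fake_generate_gen and its whitespace-based token count are hypothetical stand-ins for ModelContainer.generate_gen and the real tokenizer, not code from the repository.

# Hypothetical standalone sketch: mirrors the post-patch token-usage flow
# without exllamav2 or FastAPI. UsageStats matches OAI/types/common.py;
# fake_generate_gen stands in for ModelContainer.generate_gen.
from typing import Generator, Tuple

from pydantic import BaseModel


class UsageStats(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


def fake_generate_gen(prompt: str) -> Generator[Tuple[str, int, int], None, None]:
    # Stand-in for ModelContainer.generate_gen after PATCH 2/2: each yield
    # carries the text chunk plus the prompt and running completion counts.
    prompt_tokens = len(prompt.split())  # placeholder for tokenizer.encode(...).shape[-1]
    generated_tokens = 0
    for chunk in ["Hello", ", ", "world", "!"]:
        generated_tokens += 1
        yield chunk, prompt_tokens, generated_tokens


def generate(prompt: str) -> Tuple[str, int, int]:
    # Same pattern as ModelContainer.generate after the patch: collect every
    # yielded tuple, join the text parts, and keep the final token counts.
    gen = list(fake_generate_gen(prompt))
    response = "".join(part for part, _, _ in gen)
    return response, gen[-1][1], gen[-1][2]


if __name__ == "__main__":
    text, prompt_tokens, completion_tokens = generate("Count the tokens in this prompt")
    usage = UsageStats(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=prompt_tokens + completion_tokens,
    )
    print(text)
    print(usage)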