Return usage for openai streaming requests (#1663)
ichernev committed Nov 16, 2023
1 parent 415d109 commit 686f5e3
Showing 2 changed files with 29 additions and 3 deletions.
vllm/entrypoints/openai/api_server.py (26 additions, 3 deletions)
@@ -245,6 +245,7 @@ def create_stream_response_json(
         index: int,
         text: str,
         finish_reason: Optional[str] = None,
+        usage: Optional[UsageInfo] = None,
     ) -> str:
         choice_data = ChatCompletionResponseStreamChoice(
             index=index,
@@ -257,7 +258,10 @@ def create_stream_response_json(
             model=model_name,
             choices=[choice_data],
         )
-        response_json = response.json(ensure_ascii=False)
+        if usage is not None:
+            response.usage = usage
+        # exclude unset to leave details out of each sse
+        response_json = response.json(exclude_unset=True, ensure_ascii=False)
 
         return response_json

@@ -283,17 +287,25 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
                 i = output.index
                 delta_text = output.text[len(previous_texts[i]):]
                 previous_texts[i] = output.text
-                previous_num_tokens[i] = len(output.token_ids)
+                completion_tokens = len(output.token_ids)
+                previous_num_tokens[i] = completion_tokens
                 response_json = create_stream_response_json(
                     index=i,
                     text=delta_text,
                 )
                 yield f"data: {response_json}\n\n"
                 if output.finish_reason is not None:
+                    prompt_tokens = len(res.prompt_token_ids)
+                    final_usage = UsageInfo(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=prompt_tokens + completion_tokens,
+                    )
                     response_json = create_stream_response_json(
                         index=i,
                         text="",
                         finish_reason=output.finish_reason,
+                        usage=final_usage,
                     )
                     yield f"data: {response_json}\n\n"
         yield "data: [DONE]\n\n"
@@ -462,6 +474,7 @@ def create_stream_response_json(
         text: str,
         logprobs: Optional[LogProbs] = None,
         finish_reason: Optional[str] = None,
+        usage: Optional[UsageInfo] = None,
     ) -> str:
         choice_data = CompletionResponseStreamChoice(
             index=index,
@@ -475,7 +488,9 @@ def create_stream_response_json(
             model=model_name,
             choices=[choice_data],
         )
-        response_json = response.json(ensure_ascii=False)
+        if usage is not None:
+            response.usage = usage
+        response_json = response.json(exclude_unset=True, ensure_ascii=False)
 
         return response_json

@@ -505,11 +520,19 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
                 if output.finish_reason is not None:
                     logprobs = (LogProbs()
                                 if request.logprobs is not None else None)
+                    prompt_tokens = len(res.prompt_token_ids)
+                    completion_tokens = len(output.token_ids)
+                    final_usage = UsageInfo(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=prompt_tokens + completion_tokens,
+                    )
                     response_json = create_stream_response_json(
                         index=i,
                         text="",
                         logprobs=logprobs,
                         finish_reason=output.finish_reason,
+                        usage=final_usage,
                     )
                     yield f"data: {response_json}\n\n"
         yield "data: [DONE]\n\n"
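The plain completions endpoint mirrors the chat path. A hedged sketch, again not from the commit, of inspecting the raw SSE stream with requests (URL and payload values assumed):

    # Illustrative sketch only: dump raw SSE events from a streaming
    # /v1/completions request; the final data: event now carries "usage".
    import json

    import requests

    resp = requests.post(
        "http://localhost:8000/v1/completions",  # placeholder server address
        json={"model": "my-model", "prompt": "Hello", "stream": True},
        stream=True,
    )
    for raw in resp.iter_lines():
        if not raw or not raw.startswith(b"data: "):
            continue
        payload = raw[len(b"data: "):]
        if payload == b"[DONE]":
            break
        event = json.loads(payload)
        if "usage" in event:  # only present on the final chunk
            print(event["usage"])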
vllm/entrypoints/openai/protocol.py (3 additions, 0 deletions)
@@ -139,6 +139,7 @@ class CompletionStreamResponse(BaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: List[CompletionResponseStreamChoice]
+    usage: Optional[UsageInfo]
 
 
 class ChatMessage(BaseModel):
@@ -178,3 +179,5 @@ class ChatCompletionStreamResponse(BaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: List[ChatCompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = Field(
+        default=None, description="data about request and response")
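
Why intermediate chunks carry no "usage": null field at all: under pydantic v1 (the version vLLM used at the time), exclude_unset serializes a field only after it has been explicitly assigned, which the response.usage = usage assignment in api_server.py triggers for the final chunk. A minimal standalone sketch of that mechanism, with simplified stand-in models rather than the classes above:

    # Minimal sketch of the exclude_unset behaviour this commit relies on
    # (pydantic v1 API).
    from typing import Optional

    from pydantic import BaseModel


    class Usage(BaseModel):  # simplified stand-in for UsageInfo
        prompt_tokens: int = 0
        completion_tokens: int = 0
        total_tokens: int = 0


    class Chunk(BaseModel):  # simplified stand-in for the stream responses
        text: str
        usage: Optional[Usage] = None


    chunk = Chunk(text="hi")
    # `usage` was never set, so it is dropped entirely (no "usage": null):
    print(chunk.json(exclude_unset=True))  # -> {"text": "hi"}

    # Attribute assignment records the field in __fields_set__, so the
    # final chunk serializes it:
    chunk.usage = Usage(prompt_tokens=3, completion_tokens=5, total_tokens=8)
    print(chunk.json(exclude_unset=True))
    # -> {"text": "hi", "usage": {"prompt_tokens": 3, ...}}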
