From d0f4998fde4b13a41ea7c4e8e6bd46cb2b93ad39 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 6 Jun 2024 19:24:39 +0300 Subject: [PATCH 01/21] Add StreamOptions Class Add stream_options validation in CompletionRequest --- vllm/entrypoints/openai/protocol.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 11ac28e758c39..e6d39097860b9 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -96,6 +96,8 @@ class UsageInfo(OpenAIBaseModel): total_tokens: int = 0 completion_tokens: Optional[int] = 0 +class StreamOptions(OpenAIBaseModel): + include_usage: Optional[bool] class ResponseFormat(OpenAIBaseModel): # type must be "json_object" or "text" @@ -313,7 +315,6 @@ def check_logprobs(cls, data): "`top_logprobs` must be a value in the interval [0, 20].") return data - class CompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/completions/create @@ -332,6 +333,7 @@ class CompletionRequest(OpenAIBaseModel): le=torch.iinfo(torch.long).max) stop: Optional[Union[str, List[str]]] = Field(default_factory=list) stream: Optional[bool] = False + stream_options: Optional[StreamOptions] = None suffix: Optional[str] = None temperature: Optional[float] = 1.0 top_p: Optional[float] = 1.0 @@ -468,6 +470,13 @@ def check_logprobs(cls, data): " in the interval [0, 5].")) return data + @model_validator(mode="before") + @classmethod + def validate_stream_options(cls, data): + if data.get("stream_options") and not data.get("stream"): + raise ValueError("Stream options can only be defined when stream is True.") + return data + class EmbeddingRequest(BaseModel): # Ordered by official OpenAI API documentation From 8fee1547546cca452254984b64b1abfe5f55d082 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 6 Jun 2024 19:50:21 +0300 Subject: [PATCH 02/21] - Modified the initial response generation to conditionally include `usage` field based on `stream_options.include_usage`. - Enhanced the token-by-token and finish responses to conditionally include `usage` field if `stream_options.include_usage` is set. - Added a final usage statistics message if `stream_options.include_usage` is set, including prompt tokens and completion tokens. 
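As a rough wire-level sketch of the streaming behavior this commit describes (an illustration, not part of the patch itself): with stream_options={"include_usage": true}, each token chunk is expected to carry "usage": null, followed by one trailing chunk with an empty "choices" list and the token totals. The endpoint URL, model name, and prompt below are placeholders and assume an OpenAI-compatible vLLM server is already running.

import json
import requests  # any HTTP client works; requests is used here for brevity

resp = requests.post(
    "http://localhost:8000/v1/completions",  # placeholder endpoint
    json={
        "model": "my-model",                 # placeholder model name
        "prompt": "What is the capital of France?",
        "max_tokens": 5,
        "stream": True,
        "stream_options": {"include_usage": True},
    },
    stream=True,
)
for line in resp.iter_lines():
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)
    # Token chunks: populated "choices" with "usage" null.
    # Final chunk: empty "choices" and a populated "usage" object with
    # prompt_tokens / completion_tokens / total_tokens.
    print(chunk.get("choices"), chunk.get("usage"))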
--- vllm/entrypoints/openai/serving_completion.py | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 572878b5527dc..fbbb73a5f1923 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -264,7 +264,8 @@ async def completion_stream_generator( ) else: final_usage = None - response_json = CompletionStreamResponse( + + chunk= CompletionStreamResponse( id=request_id, created=created_time, model=model_name, @@ -276,10 +277,27 @@ async def completion_stream_generator( finish_reason=finish_reason, stop_reason=stop_reason, ) - ], - usage=final_usage, - ).model_dump_json(exclude_unset=True) + ] + ) + if (request.stream_options and + request.stream_options.include_usage): + chunk.usage = None + + response_json = chunk.model_dump_json(exclude_unset=True) yield f"data: {response_json}\n\n" + + if request.stream_options and request.stream_options.include_usage: + final_usage_chunk = CompletionStreamResponse( + id=request_id, + created=created_time, + model=model_name, + choices=[], + usage=final_usage, + ) + final_usage_data = (final_usage_chunk.model_dump_json + (exclude_unset=True, exclude_none=True)) + yield f"data: {final_usage_data}\n\n" + except ValueError as e: # TODO: Use a vllm-specific Validation Error data = self.create_streaming_error_response(str(e)) From 03c309e56fa5bfb7ac9bc5aa5d486249a5e5c9de Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 6 Jun 2024 19:56:08 +0300 Subject: [PATCH 03/21] Tests added for the following scenarios: - stream=True, stream_options=None - stream=True, stream_options={"include_usage": True} - stream=True, stream_options={"include_usage": False} - stream=False, stream_options={"include_usage": None} - stream=False, stream_options={"include_usage": False} - stream=False, stream_options={"include_usage": True} --- tests/entrypoints/test_openai_server.py | 104 ++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 3721b047e43d9..e74f4121506a7 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1342,6 +1342,110 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.prompt_tokens == 17 assert embeddings.usage.total_tokens == 17 +async def test_completion_stream_options(server, client: openai.AsyncOpenAI, + model_name: str): + prompt = "What is the capital of France?" 
+ + # Test stream=True, stream_options=None + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options=None, + ) + chunks = [] + async for chunk in stream: + chunks.append(chunk.choices[0].text) + assert len(chunks) > 0 + assert "usage" not in chunk + + # Test stream=True, stream_options={"include_usage": False} + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={"include_usage": False}, + ) + chunks = [] + async for chunk in stream: + chunks.append(chunk.choices[0].text) + assert len(chunks) > 0 + assert "usage" not in chunk + + # Test stream=True, stream_options={"include_usage": True} + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={"include_usage": True}, + ) + chunks = [] + finish_reason_count = 0 + async for chunk in stream: + if chunk.choices[0].finish_reason is None: + assert chunk.usage is None + chunks.append(chunk.choices[0].text) + else: + assert chunk.usage is None + finish_reason_count += 1 + + # The last message should have usage and no choices + last_message = await stream.__anext__() + assert last_message.usage is not None + assert last_message.usage.prompt_tokens > 0 + assert last_message.usage.completion_tokens > 0 + assert (last_message.usage.total_tokens == + last_message.usage.prompt_tokens + + last_message.usage.completion_tokens) + assert last_message.choices == [] + + # Test stream=False, stream_options=None + response = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options=None, + ) + assert response.usage is not None + assert response.usage.prompt_tokens > 0 + assert response.usage.completion_tokens > 0 + assert (response.usage.total_tokens == + response.usage.prompt_tokens + response.usage.completion_tokens) + + # Test stream=False, stream_options={"include_usage": False} + response = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": False}, + ) + assert response.usage is None + + # Test stream=False, stream_options={"include_usage": True} + response = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}, + ) + assert response.usage is not None + assert response.usage.prompt_tokens > 0 + assert response.usage.completion_tokens > 0 + assert (response.usage.total_tokens == response.usage.prompt_tokens + + response.usage.completion_tokens) + if __name__ == "__main__": pytest.main([__file__]) From a592cd8899ef5827344839727ded664a9e7b97c9 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 6 Jun 2024 20:03:16 +0300 Subject: [PATCH 04/21] Fixed issues related to formatting. format.sh. 
--- tests/entrypoints/test_openai_server.py | 15 ++++++++------- vllm/entrypoints/openai/protocol.py | 6 +++++- vllm/entrypoints/openai/serving_completion.py | 18 +++++++++--------- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index e74f4121506a7..304927bd669c1 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1342,7 +1342,8 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.prompt_tokens == 17 assert embeddings.usage.total_tokens == 17 -async def test_completion_stream_options(server, client: openai.AsyncOpenAI, + +async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): prompt = "What is the capital of France?" @@ -1400,9 +1401,9 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, assert last_message.usage is not None assert last_message.usage.prompt_tokens > 0 assert last_message.usage.completion_tokens > 0 - assert (last_message.usage.total_tokens == - last_message.usage.prompt_tokens + - last_message.usage.completion_tokens) + assert ( + last_message.usage.total_tokens == last_message.usage.prompt_tokens + + last_message.usage.completion_tokens) assert last_message.choices == [] # Test stream=False, stream_options=None @@ -1417,8 +1418,8 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, assert response.usage is not None assert response.usage.prompt_tokens > 0 assert response.usage.completion_tokens > 0 - assert (response.usage.total_tokens == - response.usage.prompt_tokens + response.usage.completion_tokens) + assert (response.usage.total_tokens == response.usage.prompt_tokens + + response.usage.completion_tokens) # Test stream=False, stream_options={"include_usage": False} response = await client.completions.create( @@ -1443,7 +1444,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, assert response.usage is not None assert response.usage.prompt_tokens > 0 assert response.usage.completion_tokens > 0 - assert (response.usage.total_tokens == response.usage.prompt_tokens + + assert (response.usage.total_tokens == response.usage.prompt_tokens + response.usage.completion_tokens) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index e6d39097860b9..868cf8a1988f8 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -96,9 +96,11 @@ class UsageInfo(OpenAIBaseModel): total_tokens: int = 0 completion_tokens: Optional[int] = 0 + class StreamOptions(OpenAIBaseModel): include_usage: Optional[bool] + class ResponseFormat(OpenAIBaseModel): # type must be "json_object" or "text" type: Literal["text", "json_object"] @@ -315,6 +317,7 @@ def check_logprobs(cls, data): "`top_logprobs` must be a value in the interval [0, 20].") return data + class CompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/completions/create @@ -474,7 +477,8 @@ def check_logprobs(cls, data): @classmethod def validate_stream_options(cls, data): if data.get("stream_options") and not data.get("stream"): - raise ValueError("Stream options can only be defined when stream is True.") + raise ValueError( + "Stream options can only be defined when stream is True.") return data diff --git a/vllm/entrypoints/openai/serving_completion.py 
b/vllm/entrypoints/openai/serving_completion.py index fbbb73a5f1923..9a8f424a18008 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -264,8 +264,8 @@ async def completion_stream_generator( ) else: final_usage = None - - chunk= CompletionStreamResponse( + + chunk = CompletionStreamResponse( id=request_id, created=created_time, model=model_name, @@ -277,16 +277,16 @@ async def completion_stream_generator( finish_reason=finish_reason, stop_reason=stop_reason, ) - ] - ) - if (request.stream_options and - request.stream_options.include_usage): + ]) + if (request.stream_options + and request.stream_options.include_usage): chunk.usage = None response_json = chunk.model_dump_json(exclude_unset=True) yield f"data: {response_json}\n\n" - if request.stream_options and request.stream_options.include_usage: + if (request.stream_options + and request.stream_options.include_usage): final_usage_chunk = CompletionStreamResponse( id=request_id, created=created_time, @@ -294,8 +294,8 @@ async def completion_stream_generator( choices=[], usage=final_usage, ) - final_usage_data = (final_usage_chunk.model_dump_json - (exclude_unset=True, exclude_none=True)) + final_usage_data = (final_usage_chunk.model_dump_json( + exclude_unset=True, exclude_none=True)) yield f"data: {final_usage_data}\n\n" except ValueError as e: From a7319e11b130e2c8bbd7925f6e03c9d4360ab403 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Fri, 7 Jun 2024 16:07:17 +0300 Subject: [PATCH 05/21] FIxing testing file. Noted by DrakLIght there was two issues: 1. Notation of `@pytest.mark.asyncio` on the test function. 2. Checking on chunk usage on a non exisiting variable. --- tests/entrypoints/test_openai_server.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 304927bd669c1..d7138a54fa4a9 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1342,7 +1342,7 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.prompt_tokens == 17 assert embeddings.usage.total_tokens == 17 - +@pytest.mark.asyncio async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): prompt = "What is the capital of France?" @@ -1360,7 +1360,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, async for chunk in stream: chunks.append(chunk.choices[0].text) assert len(chunks) > 0 - assert "usage" not in chunk + assert all(chunk.usage is None for chunk in chunks) # Test stream=True, stream_options={"include_usage": False} stream = await client.completions.create( @@ -1375,7 +1375,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, async for chunk in stream: chunks.append(chunk.choices[0].text) assert len(chunks) > 0 - assert "usage" not in chunk + assert all(chunk.usage is None for chunk in chunks) # Test stream=True, stream_options={"include_usage": True} stream = await client.completions.create( From 4d33e2878a05d7fe21dbf8fd27fbc70782c05936 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Fri, 7 Jun 2024 16:14:26 +0300 Subject: [PATCH 06/21] Ran formater.sh to fix formatting issues. 
Should have done that on prev commit TBH --- tests/entrypoints/test_openai_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index d7138a54fa4a9..86fb316c8ecfb 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1342,6 +1342,7 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.prompt_tokens == 17 assert embeddings.usage.total_tokens == 17 + @pytest.mark.asyncio async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): From d6ac8910211d5f3a4e2897ddbc8c4b459e54a515 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Fri, 7 Jun 2024 16:58:35 +0300 Subject: [PATCH 07/21] Fixed formating. -- Removed redundent StreamOptions. -- Formater.sh --- tests/entrypoints/test_openai_server.py | 10 +++++++--- vllm/entrypoints/openai/protocol.py | 4 ---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 9870e6b16986a..5cc61ad2b8ac9 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1342,12 +1342,14 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.prompt_tokens == 17 assert embeddings.usage.total_tokens == 17 + @pytest.mark.parametrize( "model_name", [MODEL_NAME], ) -async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_chat_completion_stream_options(server, + client: openai.AsyncOpenAI, + model_name: str): prompt = "What is the capital of France?" # Test stream=True, stream_options=None @@ -1442,6 +1444,7 @@ async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI stream_options={"include_usage": True}, ) + @pytest.mark.asyncio async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): @@ -1546,6 +1549,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, assert response.usage.completion_tokens > 0 assert (response.usage.total_tokens == response.usage.prompt_tokens + response.usage.completion_tokens) - + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index bfc42b306a36a..9424ccc959d11 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -97,10 +97,6 @@ class UsageInfo(OpenAIBaseModel): completion_tokens: Optional[int] = 0 -class StreamOptions(OpenAIBaseModel): - include_usage: Optional[bool] - - class ResponseFormat(OpenAIBaseModel): # type must be "json_object" or "text" type: Literal["text", "json_object"] From 1473a7fcdda5710a77505c32e55227230cccd9cf Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sat, 8 Jun 2024 15:51:08 +0300 Subject: [PATCH 08/21] Tests fixture: -- Added parametrize in completion stream options. -- Revised streaming tests as the usage is no longer needed to be asserted. 
--- tests/entrypoints/test_openai_server.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 5cc61ad2b8ac9..005ae2d6c6e69 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -495,7 +495,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI, assert finish_reason_count == 1 assert chunk.choices[0].finish_reason == "length" assert chunk.choices[0].text - assert chunk.usage == single_usage assert "".join(chunks) == single_output @@ -1446,6 +1445,10 @@ async def test_chat_completion_stream_options(server, @pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): prompt = "What is the capital of France?" From 3d987d042455cf0c119ee5302f4036831690bfb0 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sat, 8 Jun 2024 15:55:16 +0300 Subject: [PATCH 09/21] Fixing testing. -- single_usage is no longer needed inside test completion streaming. --- tests/entrypoints/test_openai_server.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 005ae2d6c6e69..476ec40819762 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -478,8 +478,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI, temperature=0.0, ) single_output = single_completion.choices[0].text - single_usage = single_completion.usage - stream = await client.completions.create(model=model_name, prompt=prompt, max_tokens=5, @@ -1342,6 +1340,7 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.total_tokens == 17 +@pytest.mark.asyncio @pytest.mark.parametrize( "model_name", [MODEL_NAME], From d25d0554c342eaa713626a88cfdfa8d38ebdca5a Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 09:38:25 +0300 Subject: [PATCH 10/21] Tests Related: -- Resolved concerns raised by DarkLight (Mistake related to client.chat.completions.creat) -- Resolved issue related to MODEL_NAME, --- tests/entrypoints/test_openai_server.py | 60 +++++++++++++------------ 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 476ec40819762..b7a15f1efaf76 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1343,61 +1343,65 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME], + [MODEL_NAME, "zephyr-lora"], ) async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): - prompt = "What is the capital of France?" + messages = [{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "What is the capital of France?" 
+ }] # Test stream=True, stream_options=None - stream = await client.completions.create( + stream = await client.chat.completions.create( model=model_name, - prompt=prompt, - max_tokens=5, + messages=messages, + max_tokens=10, temperature=0.0, stream=True, stream_options=None, ) chunks = [] async for chunk in stream: - chunks.append(chunk.choices[0].text) + chunks.append(chunk.choices[0].delta.get("content", "")) assert len(chunks) > 0 - assert "usage" not in chunk + assert all(chunk.usage is None for chunk in chunks) # Test stream=True, stream_options={"include_usage": False} - stream = await client.completions.create( + stream = await client.chat.completions.create( model=model_name, - prompt=prompt, - max_tokens=5, + messages=messages, + max_tokens=10, temperature=0.0, stream=True, stream_options={"include_usage": False}, ) chunks = [] async for chunk in stream: - chunks.append(chunk.choices[0].text) + chunks.append(chunk.choices[0].delta.get("content", "")) assert len(chunks) > 0 - assert "usage" not in chunk + assert all(chunk.usage is None for chunk in chunks) # Test stream=True, stream_options={"include_usage": True} - stream = await client.completions.create( + stream = await client.chat.completions.create( model=model_name, - prompt=prompt, - max_tokens=5, + messages=messages, + max_tokens=10, temperature=0.0, stream=True, stream_options={"include_usage": True}, ) chunks = [] - finish_reason_count = 0 async for chunk in stream: if chunk.choices[0].finish_reason is None: assert chunk.usage is None - chunks.append(chunk.choices[0].text) + chunks.append(chunk.choices[0].delta.get("content", "")) else: assert chunk.usage is None - finish_reason_count += 1 # The last message should have usage and no choices last_message = await stream.__anext__() @@ -1411,10 +1415,10 @@ async def test_chat_completion_stream_options(server, # Test stream=False, stream_options={"include_usage": None} with pytest.raises(BadRequestError): - await client.completions.create( + await client.chat.completions.create( model=model_name, - prompt=prompt, - max_tokens=5, + messages=messages, + max_tokens=10, temperature=0.0, stream=False, stream_options={"include_usage": None}, @@ -1422,10 +1426,10 @@ async def test_chat_completion_stream_options(server, # Test stream=False, stream_options={"include_usage": False} with pytest.raises(BadRequestError): - await client.completions.create( + await client.chat.completions.create( model=model_name, - prompt=prompt, - max_tokens=5, + messages=messages, + max_tokens=10, temperature=0.0, stream=False, stream_options={"include_usage": False}, @@ -1433,10 +1437,10 @@ async def test_chat_completion_stream_options(server, # Test stream=False, stream_options={"include_usage": True} with pytest.raises(BadRequestError): - await client.completions.create( + await client.chat.completions.create( model=model_name, - prompt=prompt, - max_tokens=5, + messages=messages, + max_tokens=10, temperature=0.0, stream=False, stream_options={"include_usage": True}, @@ -1446,7 +1450,7 @@ async def test_chat_completion_stream_options(server, @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME], + [MODEL_NAME, "zephyr-lora"], ) async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): From 2e96a592226148c684fa767f7cb2ab36d612f9bc Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 14:57:51 +0300 Subject: [PATCH 11/21] Fixing Testing. 
-- Redundent test removed (stream=True, stream_options=None) --- tests/entrypoints/test_openai_server.py | 32 +------------------------ 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index b7a15f1efaf76..c8001905af315 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1356,21 +1356,6 @@ async def test_chat_completion_stream_options(server, "content": "What is the capital of France?" }] - # Test stream=True, stream_options=None - stream = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - stream_options=None, - ) - chunks = [] - async for chunk in stream: - chunks.append(chunk.choices[0].delta.get("content", "")) - assert len(chunks) > 0 - assert all(chunk.usage is None for chunk in chunks) - # Test stream=True, stream_options={"include_usage": False} stream = await client.chat.completions.create( model=model_name, @@ -1453,24 +1438,9 @@ async def test_chat_completion_stream_options(server, [MODEL_NAME, "zephyr-lora"], ) async def test_completion_stream_options(server, client: openai.AsyncOpenAI, - model_name: str): + model_name: str): prompt = "What is the capital of France?" - # Test stream=True, stream_options=None - stream = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options=None, - ) - chunks = [] - async for chunk in stream: - chunks.append(chunk.choices[0].text) - assert len(chunks) > 0 - assert all(chunk.usage is None for chunk in chunks) - # Test stream=True, stream_options={"include_usage": False} stream = await client.completions.create( model=model_name, From 5c1c0c6f04619a8ad938c94477929078ebc17666 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 15:02:32 +0300 Subject: [PATCH 12/21] Fortmater fixure. --- tests/entrypoints/test_openai_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index c8001905af315..3f2ee5c3d481a 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1438,7 +1438,7 @@ async def test_chat_completion_stream_options(server, [MODEL_NAME, "zephyr-lora"], ) async def test_completion_stream_options(server, client: openai.AsyncOpenAI, - model_name: str): + model_name: str): prompt = "What is the capital of France?" # Test stream=True, stream_options={"include_usage": False} From 23aa9036693132d339244e5b4d451cbd446df72e Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 16:04:15 +0300 Subject: [PATCH 13/21] Fixed testing. -- Not found. 
--- tests/entrypoints/test_openai_server.py | 41 +++++++++---------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 3f2ee5c3d481a..ca24569e3f3e8 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1363,8 +1363,7 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=True, - stream_options={"include_usage": False}, - ) + stream_options={"include_usage": False}) chunks = [] async for chunk in stream: chunks.append(chunk.choices[0].delta.get("content", "")) @@ -1378,8 +1377,7 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=True, - stream_options={"include_usage": True}, - ) + stream_options={"include_usage": True}) chunks = [] async for chunk in stream: if chunk.choices[0].finish_reason is None: @@ -1406,8 +1404,7 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": None}, - ) + stream_options={"include_usage": None}) # Test stream=False, stream_options={"include_usage": False} with pytest.raises(BadRequestError): @@ -1417,8 +1414,7 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": False}, - ) + stream_options={"include_usage": False}) # Test stream=False, stream_options={"include_usage": True} with pytest.raises(BadRequestError): @@ -1428,8 +1424,7 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": True}, - ) + stream_options={"include_usage": True}) @pytest.mark.asyncio @@ -1448,8 +1443,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=True, - stream_options={"include_usage": False}, - ) + stream_options={"include_usage": False}) chunks = [] async for chunk in stream: chunks.append(chunk.choices[0].text) @@ -1463,8 +1457,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=True, - stream_options={"include_usage": True}, - ) + stream_options={"include_usage": True}) chunks = [] finish_reason_count = 0 async for chunk in stream: @@ -1486,14 +1479,12 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, assert last_message.choices == [] # Test stream=False, stream_options=None - response = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options=None, - ) + response = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options=None) assert response.usage is not None assert response.usage.prompt_tokens > 0 assert response.usage.completion_tokens > 0 @@ -1507,8 +1498,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=False, - stream_options={"include_usage": False}, - ) + stream_options={"include_usage": False}) assert response.usage is None # Test stream=False, stream_options={"include_usage": True} @@ -1518,8 +1508,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=False, - stream_options={"include_usage": True}, - ) + 
stream_options={"include_usage": True}) assert response.usage is not None assert response.usage.prompt_tokens > 0 assert response.usage.completion_tokens > 0 From 776c73b8323e31b81f215a52beb55df295ce6a2e Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 17:26:43 +0300 Subject: [PATCH 14/21] Reorder tests in server test file to resolve conflicts - Moved stream options tests before embeddings tests to address conflicts between pytest fixtures. - This change is in response to a suggestion from DarkLight1337. - Adjustments made to ensure test suite runs without errors. --- tests/entrypoints/test_openai_server.py | 351 ++++++++++++------------ 1 file changed, 175 insertions(+), 176 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index ca24569e3f3e8..ff5779dbb802c 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -547,6 +547,181 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI, assert "".join(chunks) == output +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_chat_completion_stream_options(server, + client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "What is the capital of France?" + }] + + # Test stream=True, stream_options={"include_usage": False} + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=True, + stream_options={"include_usage": False}) + chunks = [] + async for chunk in stream: + chunks.append(chunk.choices[0].delta.get("content", "")) + assert len(chunks) > 0 + assert all(chunk.usage is None for chunk in chunks) + + # Test stream=True, stream_options={"include_usage": True} + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=True, + stream_options={"include_usage": True}) + chunks = [] + async for chunk in stream: + if chunk.choices[0].finish_reason is None: + assert chunk.usage is None + chunks.append(chunk.choices[0].delta.get("content", "")) + else: + assert chunk.usage is None + + # The last message should have usage and no choices + last_message = await stream.__anext__() + assert last_message.usage is not None + assert last_message.usage.prompt_tokens > 0 + assert last_message.usage.completion_tokens > 0 + assert last_message.usage.total_tokens == ( + last_message.usage.prompt_tokens + + last_message.usage.completion_tokens) + assert last_message.choices == [] + + # Test stream=False, stream_options={"include_usage": None} + with pytest.raises(BadRequestError): + await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=False, + stream_options={"include_usage": None}) + + # Test stream=False, stream_options={"include_usage": False} + with pytest.raises(BadRequestError): + await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=False, + stream_options={"include_usage": False}) + + # Test stream=False, stream_options={"include_usage": True} + with pytest.raises(BadRequestError): + await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}) + + 
+@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_completion_stream_options(server, client: openai.AsyncOpenAI, + model_name: str): + prompt = "What is the capital of France?" + + # Test stream=True, stream_options={"include_usage": False} + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={"include_usage": False}) + chunks = [] + async for chunk in stream: + chunks.append(chunk.choices[0].text) + assert len(chunks) > 0 + assert all(chunk.usage is None for chunk in chunks) + + # Test stream=True, stream_options={"include_usage": True} + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={"include_usage": True}) + chunks = [] + finish_reason_count = 0 + async for chunk in stream: + if chunk.choices[0].finish_reason is None: + assert chunk.usage is None + chunks.append(chunk.choices[0].text) + else: + assert chunk.usage is None + finish_reason_count += 1 + + # The last message should have usage and no choices + last_message = await stream.__anext__() + assert last_message.usage is not None + assert last_message.usage.prompt_tokens > 0 + assert last_message.usage.completion_tokens > 0 + assert ( + last_message.usage.total_tokens == last_message.usage.prompt_tokens + + last_message.usage.completion_tokens) + assert last_message.choices == [] + + # Test stream=False, stream_options=None + response = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options=None) + assert response.usage is not None + assert response.usage.prompt_tokens > 0 + assert response.usage.completion_tokens > 0 + assert (response.usage.total_tokens == response.usage.prompt_tokens + + response.usage.completion_tokens) + + # Test stream=False, stream_options={"include_usage": False} + response = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": False}) + assert response.usage is None + + # Test stream=False, stream_options={"include_usage": True} + response = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}) + assert response.usage is not None + assert response.usage.prompt_tokens > 0 + assert response.usage.completion_tokens > 0 + assert (response.usage.total_tokens == response.usage.prompt_tokens + + response.usage.completion_tokens) + @pytest.mark.asyncio @pytest.mark.parametrize( # just test 1 lora hereafter @@ -1340,181 +1515,5 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.total_tokens == 17 -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_chat_completion_stream_options(server, - client: openai.AsyncOpenAI, - model_name: str): - messages = [{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "What is the capital of France?" 
- }] - - # Test stream=True, stream_options={"include_usage": False} - stream = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - stream_options={"include_usage": False}) - chunks = [] - async for chunk in stream: - chunks.append(chunk.choices[0].delta.get("content", "")) - assert len(chunks) > 0 - assert all(chunk.usage is None for chunk in chunks) - - # Test stream=True, stream_options={"include_usage": True} - stream = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - stream_options={"include_usage": True}) - chunks = [] - async for chunk in stream: - if chunk.choices[0].finish_reason is None: - assert chunk.usage is None - chunks.append(chunk.choices[0].delta.get("content", "")) - else: - assert chunk.usage is None - - # The last message should have usage and no choices - last_message = await stream.__anext__() - assert last_message.usage is not None - assert last_message.usage.prompt_tokens > 0 - assert last_message.usage.completion_tokens > 0 - assert last_message.usage.total_tokens == ( - last_message.usage.prompt_tokens + - last_message.usage.completion_tokens) - assert last_message.choices == [] - - # Test stream=False, stream_options={"include_usage": None} - with pytest.raises(BadRequestError): - await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=False, - stream_options={"include_usage": None}) - - # Test stream=False, stream_options={"include_usage": False} - with pytest.raises(BadRequestError): - await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=False, - stream_options={"include_usage": False}) - - # Test stream=False, stream_options={"include_usage": True} - with pytest.raises(BadRequestError): - await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_completion_stream_options(server, client: openai.AsyncOpenAI, - model_name: str): - prompt = "What is the capital of France?" 
- - # Test stream=True, stream_options={"include_usage": False} - stream = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={"include_usage": False}) - chunks = [] - async for chunk in stream: - chunks.append(chunk.choices[0].text) - assert len(chunks) > 0 - assert all(chunk.usage is None for chunk in chunks) - - # Test stream=True, stream_options={"include_usage": True} - stream = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={"include_usage": True}) - chunks = [] - finish_reason_count = 0 - async for chunk in stream: - if chunk.choices[0].finish_reason is None: - assert chunk.usage is None - chunks.append(chunk.choices[0].text) - else: - assert chunk.usage is None - finish_reason_count += 1 - - # The last message should have usage and no choices - last_message = await stream.__anext__() - assert last_message.usage is not None - assert last_message.usage.prompt_tokens > 0 - assert last_message.usage.completion_tokens > 0 - assert ( - last_message.usage.total_tokens == last_message.usage.prompt_tokens + - last_message.usage.completion_tokens) - assert last_message.choices == [] - - # Test stream=False, stream_options=None - response = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options=None) - assert response.usage is not None - assert response.usage.prompt_tokens > 0 - assert response.usage.completion_tokens > 0 - assert (response.usage.total_tokens == response.usage.prompt_tokens + - response.usage.completion_tokens) - - # Test stream=False, stream_options={"include_usage": False} - response = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": False}) - assert response.usage is None - - # Test stream=False, stream_options={"include_usage": True} - response = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) - assert response.usage is not None - assert response.usage.prompt_tokens > 0 - assert response.usage.completion_tokens > 0 - assert (response.usage.total_tokens == response.usage.prompt_tokens + - response.usage.completion_tokens) - - if __name__ == "__main__": pytest.main([__file__]) From 4c828f4dcba821220098d354da98a381af6a651b Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 17:29:01 +0300 Subject: [PATCH 15/21] Reorder tests in server test file to resolve conflicts - Moved stream options tests before embeddings tests to address conflicts between pytest fixtures. - This change is in response to a suggestion from DarkLight1337. - Adjustments made to ensure test suite runs without errors. 
-- Formated code --- tests/entrypoints/test_openai_server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index ff5779dbb802c..9296148a0930c 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -721,7 +721,8 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, assert response.usage.completion_tokens > 0 assert (response.usage.total_tokens == response.usage.prompt_tokens + response.usage.completion_tokens) - + + @pytest.mark.asyncio @pytest.mark.parametrize( # just test 1 lora hereafter From 86d6d7a13192a45c8cdf097a1a56f31940f880e0 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 19:33:35 +0300 Subject: [PATCH 16/21] feat(tests): Update test cases for chat and completion streaming options - **Stream with `include_usage: False`**: - Added assertions to ensure no chunk contains the `usage` key. - **Stream with `include_usage: True`**: - Modified test logic to verify that every chunk has `usage` as `None` except for the last chunk, which should have `usage` populated. - **Stream=False configurations**: - Added tests to verify that using `stream_options: {"include_usage": None}`, `{"include_usage": False}`, and `{"include_usage": True}` raises a `BadRequestError`. - Removed redundant test for `stream=False` with `stream_options: {"include_usage": False}` as it overlaps with the error condition checks. --- tests/entrypoints/test_openai_server.py | 163 +++++++++--------------- 1 file changed, 63 insertions(+), 100 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 9296148a0930c..e7c7178b38db1 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -547,21 +547,20 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI, assert "".join(chunks) == output +import pytest +import openai +from openai.error import BadRequestError + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora"], + ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], ) -async def test_chat_completion_stream_options(server, - client: openai.AsyncOpenAI, - model_name: str): - messages = [{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "What is the capital of France?" 
- }] +async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"} + ] # Test stream=True, stream_options={"include_usage": False} stream = await client.chat.completions.create( @@ -570,12 +569,10 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=True, - stream_options={"include_usage": False}) - chunks = [] + stream_options={"include_usage": False} + ) async for chunk in stream: - chunks.append(chunk.choices[0].delta.get("content", "")) - assert len(chunks) > 0 - assert all(chunk.usage is None for chunk in chunks) + assert "usage" not in chunk.__dict__ # Test stream=True, stream_options={"include_usage": True} stream = await client.chat.completions.create( @@ -584,24 +581,21 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=True, - stream_options={"include_usage": True}) - chunks = [] + stream_options={"include_usage": True} + ) async for chunk in stream: if chunk.choices[0].finish_reason is None: assert chunk.usage is None - chunks.append(chunk.choices[0].delta.get("content", "")) else: - assert chunk.usage is None - - # The last message should have usage and no choices - last_message = await stream.__anext__() - assert last_message.usage is not None - assert last_message.usage.prompt_tokens > 0 - assert last_message.usage.completion_tokens > 0 - assert last_message.usage.total_tokens == ( - last_message.usage.prompt_tokens + - last_message.usage.completion_tokens) - assert last_message.choices == [] + assert chunk.usage is None # Last chunk in stream should have usage as None + final_chunk = await stream.__anext__() + assert final_chunk.usage is not None + assert final_chunk.usage.prompt_tokens > 0 + assert final_chunk.usage.completion_tokens > 0 + assert final_chunk.usage.total_tokens == ( + final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens + ) + assert final_chunk.choices == [] # Test stream=False, stream_options={"include_usage": None} with pytest.raises(BadRequestError): @@ -611,17 +605,8 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": None}) - - # Test stream=False, stream_options={"include_usage": False} - with pytest.raises(BadRequestError): - await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=False, - stream_options={"include_usage": False}) + stream_options={"include_usage": None} + ) # Test stream=False, stream_options={"include_usage": True} with pytest.raises(BadRequestError): @@ -631,16 +616,16 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": True}) + stream_options={"include_usage": True} + ) @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora"], + ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], ) -async def test_completion_stream_options(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): prompt = "What is the capital of France?" 
# Test stream=True, stream_options={"include_usage": False} @@ -650,12 +635,10 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=True, - stream_options={"include_usage": False}) - chunks = [] + stream_options={"include_usage": False} + ) async for chunk in stream: - chunks.append(chunk.choices[0].text) - assert len(chunks) > 0 - assert all(chunk.usage is None for chunk in chunks) + assert "usage" not in chunk.__dict__ # Test stream=True, stream_options={"include_usage": True} stream = await client.completions.create( @@ -664,63 +647,43 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=True, - stream_options={"include_usage": True}) - chunks = [] - finish_reason_count = 0 + stream_options={"include_usage": True} + ) async for chunk in stream: if chunk.choices[0].finish_reason is None: assert chunk.usage is None - chunks.append(chunk.choices[0].text) else: - assert chunk.usage is None - finish_reason_count += 1 + assert chunk.usage is None # The last chunk should have `usage` filled + final_chunk = await stream.__anext__() + assert final_chunk.usage is not None + assert final_chunk.usage.prompt_tokens > 0 + assert final_chunk.usage.completion_tokens > 0 + assert final_chunk.usage.total_tokens == ( + final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens + ) + assert final_chunk.choices == [] - # The last message should have usage and no choices - last_message = await stream.__anext__() - assert last_message.usage is not None - assert last_message.usage.prompt_tokens > 0 - assert last_message.usage.completion_tokens > 0 - assert ( - last_message.usage.total_tokens == last_message.usage.prompt_tokens + - last_message.usage.completion_tokens) - assert last_message.choices == [] - - # Test stream=False, stream_options=None - response = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options=None) - assert response.usage is not None - assert response.usage.prompt_tokens > 0 - assert response.usage.completion_tokens > 0 - assert (response.usage.total_tokens == response.usage.prompt_tokens + - response.usage.completion_tokens) - - # Test stream=False, stream_options={"include_usage": False} - response = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": False}) - assert response.usage is None + # Test stream=False, stream_options={"include_usage": None} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": None} + ) # Test stream=False, stream_options={"include_usage": True} - response = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) - assert response.usage is not None - assert response.usage.prompt_tokens > 0 - assert response.usage.completion_tokens > 0 - assert (response.usage.total_tokens == response.usage.prompt_tokens + - response.usage.completion_tokens) + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": True} + ) @pytest.mark.asyncio From 
fb6ae02a9871c129b68e4843d737155c6c35999b Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 19:34:50 +0300 Subject: [PATCH 17/21] Formatting related issues resolved. --- tests/entrypoints/test_openai_server.py | 80 +++++++++++-------------- 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index e7c7178b38db1..29ff2ed5c7776 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -547,20 +547,21 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI, assert "".join(chunks) == output -import pytest -import openai -from openai.error import BadRequestError - @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], ) -async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): - messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "What is the capital of France?"} - ] +async def test_chat_completion_stream_options(server, + client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "What is the capital of France?" + }] # Test stream=True, stream_options={"include_usage": False} stream = await client.chat.completions.create( @@ -569,8 +570,7 @@ async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI max_tokens=10, temperature=0.0, stream=True, - stream_options={"include_usage": False} - ) + stream_options={"include_usage": False}) async for chunk in stream: assert "usage" not in chunk.__dict__ @@ -581,20 +581,19 @@ async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI max_tokens=10, temperature=0.0, stream=True, - stream_options={"include_usage": True} - ) + stream_options={"include_usage": True}) async for chunk in stream: if chunk.choices[0].finish_reason is None: assert chunk.usage is None else: - assert chunk.usage is None # Last chunk in stream should have usage as None + assert chunk.usage is None final_chunk = await stream.__anext__() assert final_chunk.usage is not None assert final_chunk.usage.prompt_tokens > 0 assert final_chunk.usage.completion_tokens > 0 assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens - ) + final_chunk.usage.prompt_tokens + + final_chunk.usage.completion_tokens) assert final_chunk.choices == [] # Test stream=False, stream_options={"include_usage": None} @@ -605,8 +604,7 @@ async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI max_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": None} - ) + stream_options={"include_usage": None}) # Test stream=False, stream_options={"include_usage": True} with pytest.raises(BadRequestError): @@ -616,8 +614,7 @@ async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI max_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": True} - ) + stream_options={"include_usage": True}) @pytest.mark.asyncio @@ -625,7 +622,8 @@ async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI "model_name", ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], ) -async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): +async def test_completion_stream_options(server, 
client: openai.AsyncOpenAI, + model_name: str): prompt = "What is the capital of France?" # Test stream=True, stream_options={"include_usage": False} @@ -635,8 +633,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, mod max_tokens=5, temperature=0.0, stream=True, - stream_options={"include_usage": False} - ) + stream_options={"include_usage": False}) async for chunk in stream: assert "usage" not in chunk.__dict__ @@ -647,43 +644,38 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, mod max_tokens=5, temperature=0.0, stream=True, - stream_options={"include_usage": True} - ) + stream_options={"include_usage": True}) async for chunk in stream: if chunk.choices[0].finish_reason is None: assert chunk.usage is None else: - assert chunk.usage is None # The last chunk should have `usage` filled + assert chunk.usage is None final_chunk = await stream.__anext__() assert final_chunk.usage is not None assert final_chunk.usage.prompt_tokens > 0 assert final_chunk.usage.completion_tokens > 0 assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens - ) + final_chunk.usage.prompt_tokens + + final_chunk.usage.completion_tokens) assert final_chunk.choices == [] # Test stream=False, stream_options={"include_usage": None} with pytest.raises(BadRequestError): - await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": None} - ) + await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": None}) # Test stream=False, stream_options={"include_usage": True} with pytest.raises(BadRequestError): - await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": True} - ) + await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}) @pytest.mark.asyncio From 22cc139fd7e381b37b202a1944b3967dbc831523 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 21:39:27 +0300 Subject: [PATCH 18/21] fix(tests): Update streaming tests for correct handling of 'usage' attribute - **Stream with `include_usage: False`**: - Updated tests to assert that the `usage` attribute is `None` instead of checking its absence in the chunk dictionary. This aligns with the observed behavior where `usage` is present but set to `None`. 
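A minimal sketch of why the earlier membership check failed (a stand-in pydantic model, not the actual openai client class): response objects declare usage as an optional field with a None default, so the attribute is always present on the instance and the meaningful assertion is on its value.

from typing import Optional
from pydantic import BaseModel

class ChunkSketch(BaseModel):
    # Optional field with a default: the attribute exists even when the
    # server sends no usage data, it is simply None.
    usage: Optional[dict] = None

chunk = ChunkSketch()
print("usage" in chunk.__dict__)  # True  -> `"usage" not in chunk.__dict__` always fails
print(chunk.usage is None)        # True  -> this is the reliable check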
--- tests/entrypoints/test_openai_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 29ff2ed5c7776..dd3389862eaf5 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -572,7 +572,7 @@ async def test_chat_completion_stream_options(server, stream=True, stream_options={"include_usage": False}) async for chunk in stream: - assert "usage" not in chunk.__dict__ + assert chunk.usage is None # Test stream=True, stream_options={"include_usage": True} stream = await client.chat.completions.create( @@ -635,7 +635,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, stream=True, stream_options={"include_usage": False}) async for chunk in stream: - assert "usage" not in chunk.__dict__ + assert chunk.usage is None # Test stream=True, stream_options={"include_usage": True} stream = await client.completions.create( From 0d9b6b13988d45e9127a369545b613050112a8a7 Mon Sep 17 00:00:00 2001 From: Itay Etelis <92247226+Etelis@users.noreply.github.com> Date: Mon, 10 Jun 2024 12:59:44 +0300 Subject: [PATCH 19/21] Fix: Incorrect indentation causing empty `choices` entries - Incorrect indent caused an empty `choices` list after each generated choice. - Moved the final `usage` creation step back one indent level to fix this. --- vllm/entrypoints/openai/serving_completion.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 9a8f424a18008..c3c40f2b97d14 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -285,18 +285,18 @@ async def completion_stream_generator( response_json = chunk.model_dump_json(exclude_unset=True) yield f"data: {response_json}\n\n" - if (request.stream_options - and request.stream_options.include_usage): - final_usage_chunk = CompletionStreamResponse( - id=request_id, - created=created_time, - model=model_name, - choices=[], - usage=final_usage, - ) - final_usage_data = (final_usage_chunk.model_dump_json( - exclude_unset=True, exclude_none=True)) - yield f"data: {final_usage_data}\n\n" + if (request.stream_options + and request.stream_options.include_usage): + final_usage_chunk = CompletionStreamResponse( + id=request_id, + created=created_time, + model=model_name, + choices=[], + usage=final_usage, + ) + final_usage_data = (final_usage_chunk.model_dump_json( + exclude_unset=True, exclude_none=True)) + yield f"data: {final_usage_data}\n\n" except ValueError as e: # TODO: Use a vllm-specific Validation Error From 184e7d9919c6488cb6cc9c9d50670bf20a7160f3 Mon Sep 17 00:00:00 2001 From: Itay Etelis <92247226+Etelis@users.noreply.github.com> Date: Mon, 10 Jun 2024 13:20:02 +0300 Subject: [PATCH 20/21] Fix `serving_chat.py`: - Incorrect indent caused double sending of a chunk, resulting in a server crash. - Moved the final `usage` creation step back two indent levels to fix this.
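As an illustrative aside (a simplified standalone sketch, not the actual vLLM generator), the control flow this fix and the previous one restore is: the usage-only chunk is built and yielded exactly once per request, after the per-choice streaming loop, rather than inside it.

    from dataclasses import dataclass
    from typing import AsyncIterator, List, Optional

    @dataclass
    class StreamOptions:
        include_usage: bool = False

    @dataclass
    class Request:
        stream_options: Optional[StreamOptions] = None

    async def stream_response(request: Request,
                              texts: List[str]) -> AsyncIterator[str]:
        for text in texts:
            # One SSE data chunk per generated choice.
            yield f"data: {text}\n\n"
        # Outside the loop: at most one trailing usage-only chunk per request,
        # emitted only when the client explicitly asked for it.
        if request.stream_options and request.stream_options.include_usage:
            yield "data: <final usage-only chunk>\n\n"
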
--- vllm/entrypoints/openai/serving_chat.py | 36 ++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 883567abf415b..f20cfcf9d54cf 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -360,25 +360,25 @@ async def chat_completion_stream_generator( yield f"data: {data}\n\n" finish_reason_sent[i] = True - if (request.stream_options - and request.stream_options.include_usage): - final_usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=previous_num_tokens[i], - total_tokens=prompt_tokens + - previous_num_tokens[i], - ) + if (request.stream_options + and request.stream_options.include_usage): + final_usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=previous_num_tokens[i], + total_tokens=prompt_tokens + + previous_num_tokens[i], + ) - final_usage_chunk = ChatCompletionStreamResponse( - id=request_id, - object=chunk_object_type, - created=created_time, - choices=[], - model=model_name, - usage=final_usage) - final_usage_data = (final_usage_chunk.model_dump_json( - exclude_unset=True, exclude_none=True)) - yield f"data: {final_usage_data}\n\n" + final_usage_chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[], + model=model_name, + usage=final_usage) + final_usage_data = (final_usage_chunk.model_dump_json( + exclude_unset=True, exclude_none=True)) + yield f"data: {final_usage_data}\n\n" except ValueError as e: # TODO: Use a vllm-specific Validation Error From e713488b3b33cfebdb5efcac9a4c07080353cd7a Mon Sep 17 00:00:00 2001 From: Itay Etelis <92247226+Etelis@users.noreply.github.com> Date: Mon, 10 Jun 2024 13:29:41 +0300 Subject: [PATCH 21/21] Running format.sh. --- tests/entrypoints/test_openai_server.py | 1 + vllm/entrypoints/openai/serving_chat.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index dd3389862eaf5..d0fe08ae0ddd2 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -582,6 +582,7 @@ async def test_chat_completion_stream_options(server, temperature=0.0, stream=True, stream_options={"include_usage": True}) + async for chunk in stream: if chunk.choices[0].finish_reason is None: assert chunk.usage is None diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index f20cfcf9d54cf..f76194671cd9f 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -365,8 +365,7 @@ async def chat_completion_stream_generator( final_usage = UsageInfo( prompt_tokens=prompt_tokens, completion_tokens=previous_num_tokens[i], - total_tokens=prompt_tokens + - previous_num_tokens[i], + total_tokens=prompt_tokens + previous_num_tokens[i], ) final_usage_chunk = ChatCompletionStreamResponse(