From d0f4998fde4b13a41ea7c4e8e6bd46cb2b93ad39 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 6 Jun 2024 19:24:39 +0300 Subject: [PATCH 01/21] Add StreamOptions Class Add stream_options validation in CompletionRequest --- vllm/entrypoints/openai/protocol.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 11ac28e758c39..e6d39097860b9 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -96,6 +96,8 @@ class UsageInfo(OpenAIBaseModel): total_tokens: int = 0 completion_tokens: Optional[int] = 0 +class StreamOptions(OpenAIBaseModel): + include_usage: Optional[bool] class ResponseFormat(OpenAIBaseModel): # type must be "json_object" or "text" @@ -313,7 +315,6 @@ def check_logprobs(cls, data): "`top_logprobs` must be a value in the interval [0, 20].") return data - class CompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/completions/create @@ -332,6 +333,7 @@ class CompletionRequest(OpenAIBaseModel): le=torch.iinfo(torch.long).max) stop: Optional[Union[str, List[str]]] = Field(default_factory=list) stream: Optional[bool] = False + stream_options: Optional[StreamOptions] = None suffix: Optional[str] = None temperature: Optional[float] = 1.0 top_p: Optional[float] = 1.0 @@ -468,6 +470,13 @@ def check_logprobs(cls, data): " in the interval [0, 5].")) return data + @model_validator(mode="before") + @classmethod + def validate_stream_options(cls, data): + if data.get("stream_options") and not data.get("stream"): + raise ValueError("Stream options can only be defined when stream is True.") + return data + class EmbeddingRequest(BaseModel): # Ordered by official OpenAI API documentation From 8fee1547546cca452254984b64b1abfe5f55d082 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 6 Jun 2024 19:50:21 +0300 Subject: [PATCH 02/21] - Modified the initial response generation to conditionally include `usage` field based on `stream_options.include_usage`. - Enhanced the token-by-token and finish responses to conditionally include `usage` field if `stream_options.include_usage` is set. - Added a final usage statistics message if `stream_options.include_usage` is set, including prompt tokens and completion tokens. 
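As a rough wire-level sketch of the streaming behavior this commit describes (an illustration, not part of the patch itself): with stream_options={"include_usage": true}, each token chunk is expected to carry "usage": null, followed by one trailing chunk with an empty "choices" list and the token totals. The endpoint URL, model name, and prompt below are placeholders and assume an OpenAI-compatible vLLM server is already running.

import json
import requests  # any HTTP client works; requests is used here for brevity

resp = requests.post(
    "http://localhost:8000/v1/completions",  # placeholder endpoint
    json={
        "model": "my-model",                 # placeholder model name
        "prompt": "What is the capital of France?",
        "max_tokens": 5,
        "stream": True,
        "stream_options": {"include_usage": True},
    },
    stream=True,
)
for line in resp.iter_lines():
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)
    # Token chunks: populated "choices" with "usage" null.
    # Final chunk: empty "choices" and a populated "usage" object with
    # prompt_tokens / completion_tokens / total_tokens.
    print(chunk.get("choices"), chunk.get("usage"))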
--- vllm/entrypoints/openai/serving_completion.py | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 572878b5527dc..fbbb73a5f1923 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -264,7 +264,8 @@ async def completion_stream_generator( ) else: final_usage = None - response_json = CompletionStreamResponse( + + chunk= CompletionStreamResponse( id=request_id, created=created_time, model=model_name, @@ -276,10 +277,27 @@ async def completion_stream_generator( finish_reason=finish_reason, stop_reason=stop_reason, ) - ], - usage=final_usage, - ).model_dump_json(exclude_unset=True) + ] + ) + if (request.stream_options and + request.stream_options.include_usage): + chunk.usage = None + + response_json = chunk.model_dump_json(exclude_unset=True) yield f"data: {response_json}\n\n" + + if request.stream_options and request.stream_options.include_usage: + final_usage_chunk = CompletionStreamResponse( + id=request_id, + created=created_time, + model=model_name, + choices=[], + usage=final_usage, + ) + final_usage_data = (final_usage_chunk.model_dump_json + (exclude_unset=True, exclude_none=True)) + yield f"data: {final_usage_data}\n\n" + except ValueError as e: # TODO: Use a vllm-specific Validation Error data = self.create_streaming_error_response(str(e)) From 03c309e56fa5bfb7ac9bc5aa5d486249a5e5c9de Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 6 Jun 2024 19:56:08 +0300 Subject: [PATCH 03/21] Tests added for the following scenarios: - stream=True, stream_options=None - stream=True, stream_options={"include_usage": True} - stream=True, stream_options={"include_usage": False} - stream=False, stream_options={"include_usage": None} - stream=False, stream_options={"include_usage": False} - stream=False, stream_options={"include_usage": True} --- tests/entrypoints/test_openai_server.py | 104 ++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 3721b047e43d9..e74f4121506a7 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1342,6 +1342,110 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.prompt_tokens == 17 assert embeddings.usage.total_tokens == 17 +async def test_completion_stream_options(server, client: openai.AsyncOpenAI, + model_name: str): + prompt = "What is the capital of France?" 
+ + # Test stream=True, stream_options=None + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options=None, + ) + chunks = [] + async for chunk in stream: + chunks.append(chunk.choices[0].text) + assert len(chunks) > 0 + assert "usage" not in chunk + + # Test stream=True, stream_options={"include_usage": False} + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={"include_usage": False}, + ) + chunks = [] + async for chunk in stream: + chunks.append(chunk.choices[0].text) + assert len(chunks) > 0 + assert "usage" not in chunk + + # Test stream=True, stream_options={"include_usage": True} + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={"include_usage": True}, + ) + chunks = [] + finish_reason_count = 0 + async for chunk in stream: + if chunk.choices[0].finish_reason is None: + assert chunk.usage is None + chunks.append(chunk.choices[0].text) + else: + assert chunk.usage is None + finish_reason_count += 1 + + # The last message should have usage and no choices + last_message = await stream.__anext__() + assert last_message.usage is not None + assert last_message.usage.prompt_tokens > 0 + assert last_message.usage.completion_tokens > 0 + assert (last_message.usage.total_tokens == + last_message.usage.prompt_tokens + + last_message.usage.completion_tokens) + assert last_message.choices == [] + + # Test stream=False, stream_options=None + response = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options=None, + ) + assert response.usage is not None + assert response.usage.prompt_tokens > 0 + assert response.usage.completion_tokens > 0 + assert (response.usage.total_tokens == + response.usage.prompt_tokens + response.usage.completion_tokens) + + # Test stream=False, stream_options={"include_usage": False} + response = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": False}, + ) + assert response.usage is None + + # Test stream=False, stream_options={"include_usage": True} + response = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}, + ) + assert response.usage is not None + assert response.usage.prompt_tokens > 0 + assert response.usage.completion_tokens > 0 + assert (response.usage.total_tokens == response.usage.prompt_tokens + + response.usage.completion_tokens) + if __name__ == "__main__": pytest.main([__file__]) From a592cd8899ef5827344839727ded664a9e7b97c9 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 6 Jun 2024 20:03:16 +0300 Subject: [PATCH 04/21] Fixed issues related to formatting. format.sh. 
--- tests/entrypoints/test_openai_server.py | 15 ++++++++------- vllm/entrypoints/openai/protocol.py | 6 +++++- vllm/entrypoints/openai/serving_completion.py | 18 +++++++++--------- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index e74f4121506a7..304927bd669c1 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1342,7 +1342,8 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.prompt_tokens == 17 assert embeddings.usage.total_tokens == 17 -async def test_completion_stream_options(server, client: openai.AsyncOpenAI, + +async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): prompt = "What is the capital of France?" @@ -1400,9 +1401,9 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, assert last_message.usage is not None assert last_message.usage.prompt_tokens > 0 assert last_message.usage.completion_tokens > 0 - assert (last_message.usage.total_tokens == - last_message.usage.prompt_tokens + - last_message.usage.completion_tokens) + assert ( + last_message.usage.total_tokens == last_message.usage.prompt_tokens + + last_message.usage.completion_tokens) assert last_message.choices == [] # Test stream=False, stream_options=None @@ -1417,8 +1418,8 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, assert response.usage is not None assert response.usage.prompt_tokens > 0 assert response.usage.completion_tokens > 0 - assert (response.usage.total_tokens == - response.usage.prompt_tokens + response.usage.completion_tokens) + assert (response.usage.total_tokens == response.usage.prompt_tokens + + response.usage.completion_tokens) # Test stream=False, stream_options={"include_usage": False} response = await client.completions.create( @@ -1443,7 +1444,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, assert response.usage is not None assert response.usage.prompt_tokens > 0 assert response.usage.completion_tokens > 0 - assert (response.usage.total_tokens == response.usage.prompt_tokens + + assert (response.usage.total_tokens == response.usage.prompt_tokens + response.usage.completion_tokens) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index e6d39097860b9..868cf8a1988f8 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -96,9 +96,11 @@ class UsageInfo(OpenAIBaseModel): total_tokens: int = 0 completion_tokens: Optional[int] = 0 + class StreamOptions(OpenAIBaseModel): include_usage: Optional[bool] + class ResponseFormat(OpenAIBaseModel): # type must be "json_object" or "text" type: Literal["text", "json_object"] @@ -315,6 +317,7 @@ def check_logprobs(cls, data): "`top_logprobs` must be a value in the interval [0, 20].") return data + class CompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/completions/create @@ -474,7 +477,8 @@ def check_logprobs(cls, data): @classmethod def validate_stream_options(cls, data): if data.get("stream_options") and not data.get("stream"): - raise ValueError("Stream options can only be defined when stream is True.") + raise ValueError( + "Stream options can only be defined when stream is True.") return data diff --git a/vllm/entrypoints/openai/serving_completion.py 
b/vllm/entrypoints/openai/serving_completion.py index fbbb73a5f1923..9a8f424a18008 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -264,8 +264,8 @@ async def completion_stream_generator( ) else: final_usage = None - - chunk= CompletionStreamResponse( + + chunk = CompletionStreamResponse( id=request_id, created=created_time, model=model_name, @@ -277,16 +277,16 @@ async def completion_stream_generator( finish_reason=finish_reason, stop_reason=stop_reason, ) - ] - ) - if (request.stream_options and - request.stream_options.include_usage): + ]) + if (request.stream_options + and request.stream_options.include_usage): chunk.usage = None response_json = chunk.model_dump_json(exclude_unset=True) yield f"data: {response_json}\n\n" - if request.stream_options and request.stream_options.include_usage: + if (request.stream_options + and request.stream_options.include_usage): final_usage_chunk = CompletionStreamResponse( id=request_id, created=created_time, @@ -294,8 +294,8 @@ async def completion_stream_generator( choices=[], usage=final_usage, ) - final_usage_data = (final_usage_chunk.model_dump_json - (exclude_unset=True, exclude_none=True)) + final_usage_data = (final_usage_chunk.model_dump_json( + exclude_unset=True, exclude_none=True)) yield f"data: {final_usage_data}\n\n" except ValueError as e: From a7319e11b130e2c8bbd7925f6e03c9d4360ab403 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Fri, 7 Jun 2024 16:07:17 +0300 Subject: [PATCH 05/21] FIxing testing file. Noted by DrakLIght there was two issues: 1. Notation of `@pytest.mark.asyncio` on the test function. 2. Checking on chunk usage on a non exisiting variable. --- tests/entrypoints/test_openai_server.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 304927bd669c1..d7138a54fa4a9 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1342,7 +1342,7 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.prompt_tokens == 17 assert embeddings.usage.total_tokens == 17 - +@pytest.mark.asyncio async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): prompt = "What is the capital of France?" @@ -1360,7 +1360,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, async for chunk in stream: chunks.append(chunk.choices[0].text) assert len(chunks) > 0 - assert "usage" not in chunk + assert all(chunk.usage is None for chunk in chunks) # Test stream=True, stream_options={"include_usage": False} stream = await client.completions.create( @@ -1375,7 +1375,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, async for chunk in stream: chunks.append(chunk.choices[0].text) assert len(chunks) > 0 - assert "usage" not in chunk + assert all(chunk.usage is None for chunk in chunks) # Test stream=True, stream_options={"include_usage": True} stream = await client.completions.create( From 4d33e2878a05d7fe21dbf8fd27fbc70782c05936 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Fri, 7 Jun 2024 16:14:26 +0300 Subject: [PATCH 06/21] Ran formater.sh to fix formatting issues. 
Should have done that on prev commit TBH --- tests/entrypoints/test_openai_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index d7138a54fa4a9..86fb316c8ecfb 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1342,6 +1342,7 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.prompt_tokens == 17 assert embeddings.usage.total_tokens == 17 + @pytest.mark.asyncio async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): From d6ac8910211d5f3a4e2897ddbc8c4b459e54a515 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Fri, 7 Jun 2024 16:58:35 +0300 Subject: [PATCH 07/21] Fixed formating. -- Removed redundent StreamOptions. -- Formater.sh --- tests/entrypoints/test_openai_server.py | 10 +++++++--- vllm/entrypoints/openai/protocol.py | 4 ---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 9870e6b16986a..5cc61ad2b8ac9 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1342,12 +1342,14 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.prompt_tokens == 17 assert embeddings.usage.total_tokens == 17 + @pytest.mark.parametrize( "model_name", [MODEL_NAME], ) -async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_chat_completion_stream_options(server, + client: openai.AsyncOpenAI, + model_name: str): prompt = "What is the capital of France?" # Test stream=True, stream_options=None @@ -1442,6 +1444,7 @@ async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI stream_options={"include_usage": True}, ) + @pytest.mark.asyncio async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): @@ -1546,6 +1549,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, assert response.usage.completion_tokens > 0 assert (response.usage.total_tokens == response.usage.prompt_tokens + response.usage.completion_tokens) - + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index bfc42b306a36a..9424ccc959d11 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -97,10 +97,6 @@ class UsageInfo(OpenAIBaseModel): completion_tokens: Optional[int] = 0 -class StreamOptions(OpenAIBaseModel): - include_usage: Optional[bool] - - class ResponseFormat(OpenAIBaseModel): # type must be "json_object" or "text" type: Literal["text", "json_object"] From 1473a7fcdda5710a77505c32e55227230cccd9cf Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sat, 8 Jun 2024 15:51:08 +0300 Subject: [PATCH 08/21] Tests fixture: -- Added parametrize in completion stream options. -- Revised streaming tests as the usage is no longer needed to be asserted. 
--- tests/entrypoints/test_openai_server.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 5cc61ad2b8ac9..005ae2d6c6e69 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -495,7 +495,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI, assert finish_reason_count == 1 assert chunk.choices[0].finish_reason == "length" assert chunk.choices[0].text - assert chunk.usage == single_usage assert "".join(chunks) == single_output @@ -1446,6 +1445,10 @@ async def test_chat_completion_stream_options(server, @pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): prompt = "What is the capital of France?" From 3d987d042455cf0c119ee5302f4036831690bfb0 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sat, 8 Jun 2024 15:55:16 +0300 Subject: [PATCH 09/21] Fixing testing. -- single_usage is no longer needed inside test completion streaming. --- tests/entrypoints/test_openai_server.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 005ae2d6c6e69..476ec40819762 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -478,8 +478,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI, temperature=0.0, ) single_output = single_completion.choices[0].text - single_usage = single_completion.usage - stream = await client.completions.create(model=model_name, prompt=prompt, max_tokens=5, @@ -1342,6 +1340,7 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.total_tokens == 17 +@pytest.mark.asyncio @pytest.mark.parametrize( "model_name", [MODEL_NAME], From d25d0554c342eaa713626a88cfdfa8d38ebdca5a Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 09:38:25 +0300 Subject: [PATCH 10/21] Tests Related: -- Resolved concerns raised by DarkLight (Mistake related to client.chat.completions.creat) -- Resolved issue related to MODEL_NAME, --- tests/entrypoints/test_openai_server.py | 60 +++++++++++++------------ 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 476ec40819762..b7a15f1efaf76 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1343,61 +1343,65 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME], + [MODEL_NAME, "zephyr-lora"], ) async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): - prompt = "What is the capital of France?" + messages = [{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "What is the capital of France?" 
+ }] # Test stream=True, stream_options=None - stream = await client.completions.create( + stream = await client.chat.completions.create( model=model_name, - prompt=prompt, - max_tokens=5, + messages=messages, + max_tokens=10, temperature=0.0, stream=True, stream_options=None, ) chunks = [] async for chunk in stream: - chunks.append(chunk.choices[0].text) + chunks.append(chunk.choices[0].delta.get("content", "")) assert len(chunks) > 0 - assert "usage" not in chunk + assert all(chunk.usage is None for chunk in chunks) # Test stream=True, stream_options={"include_usage": False} - stream = await client.completions.create( + stream = await client.chat.completions.create( model=model_name, - prompt=prompt, - max_tokens=5, + messages=messages, + max_tokens=10, temperature=0.0, stream=True, stream_options={"include_usage": False}, ) chunks = [] async for chunk in stream: - chunks.append(chunk.choices[0].text) + chunks.append(chunk.choices[0].delta.get("content", "")) assert len(chunks) > 0 - assert "usage" not in chunk + assert all(chunk.usage is None for chunk in chunks) # Test stream=True, stream_options={"include_usage": True} - stream = await client.completions.create( + stream = await client.chat.completions.create( model=model_name, - prompt=prompt, - max_tokens=5, + messages=messages, + max_tokens=10, temperature=0.0, stream=True, stream_options={"include_usage": True}, ) chunks = [] - finish_reason_count = 0 async for chunk in stream: if chunk.choices[0].finish_reason is None: assert chunk.usage is None - chunks.append(chunk.choices[0].text) + chunks.append(chunk.choices[0].delta.get("content", "")) else: assert chunk.usage is None - finish_reason_count += 1 # The last message should have usage and no choices last_message = await stream.__anext__() @@ -1411,10 +1415,10 @@ async def test_chat_completion_stream_options(server, # Test stream=False, stream_options={"include_usage": None} with pytest.raises(BadRequestError): - await client.completions.create( + await client.chat.completions.create( model=model_name, - prompt=prompt, - max_tokens=5, + messages=messages, + max_tokens=10, temperature=0.0, stream=False, stream_options={"include_usage": None}, @@ -1422,10 +1426,10 @@ async def test_chat_completion_stream_options(server, # Test stream=False, stream_options={"include_usage": False} with pytest.raises(BadRequestError): - await client.completions.create( + await client.chat.completions.create( model=model_name, - prompt=prompt, - max_tokens=5, + messages=messages, + max_tokens=10, temperature=0.0, stream=False, stream_options={"include_usage": False}, @@ -1433,10 +1437,10 @@ async def test_chat_completion_stream_options(server, # Test stream=False, stream_options={"include_usage": True} with pytest.raises(BadRequestError): - await client.completions.create( + await client.chat.completions.create( model=model_name, - prompt=prompt, - max_tokens=5, + messages=messages, + max_tokens=10, temperature=0.0, stream=False, stream_options={"include_usage": True}, @@ -1446,7 +1450,7 @@ async def test_chat_completion_stream_options(server, @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME], + [MODEL_NAME, "zephyr-lora"], ) async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): From 2e96a592226148c684fa767f7cb2ab36d612f9bc Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 14:57:51 +0300 Subject: [PATCH 11/21] Fixing Testing. 
-- Redundent test removed (stream=True, stream_options=None) --- tests/entrypoints/test_openai_server.py | 32 +------------------------ 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index b7a15f1efaf76..c8001905af315 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1356,21 +1356,6 @@ async def test_chat_completion_stream_options(server, "content": "What is the capital of France?" }] - # Test stream=True, stream_options=None - stream = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - stream_options=None, - ) - chunks = [] - async for chunk in stream: - chunks.append(chunk.choices[0].delta.get("content", "")) - assert len(chunks) > 0 - assert all(chunk.usage is None for chunk in chunks) - # Test stream=True, stream_options={"include_usage": False} stream = await client.chat.completions.create( model=model_name, @@ -1453,24 +1438,9 @@ async def test_chat_completion_stream_options(server, [MODEL_NAME, "zephyr-lora"], ) async def test_completion_stream_options(server, client: openai.AsyncOpenAI, - model_name: str): + model_name: str): prompt = "What is the capital of France?" - # Test stream=True, stream_options=None - stream = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options=None, - ) - chunks = [] - async for chunk in stream: - chunks.append(chunk.choices[0].text) - assert len(chunks) > 0 - assert all(chunk.usage is None for chunk in chunks) - # Test stream=True, stream_options={"include_usage": False} stream = await client.completions.create( model=model_name, From 5c1c0c6f04619a8ad938c94477929078ebc17666 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 15:02:32 +0300 Subject: [PATCH 12/21] Fortmater fixure. --- tests/entrypoints/test_openai_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index c8001905af315..3f2ee5c3d481a 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1438,7 +1438,7 @@ async def test_chat_completion_stream_options(server, [MODEL_NAME, "zephyr-lora"], ) async def test_completion_stream_options(server, client: openai.AsyncOpenAI, - model_name: str): + model_name: str): prompt = "What is the capital of France?" # Test stream=True, stream_options={"include_usage": False} From 23aa9036693132d339244e5b4d451cbd446df72e Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 16:04:15 +0300 Subject: [PATCH 13/21] Fixed testing. -- Not found. 
--- tests/entrypoints/test_openai_server.py | 41 +++++++++---------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 3f2ee5c3d481a..ca24569e3f3e8 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1363,8 +1363,7 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=True, - stream_options={"include_usage": False}, - ) + stream_options={"include_usage": False}) chunks = [] async for chunk in stream: chunks.append(chunk.choices[0].delta.get("content", "")) @@ -1378,8 +1377,7 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=True, - stream_options={"include_usage": True}, - ) + stream_options={"include_usage": True}) chunks = [] async for chunk in stream: if chunk.choices[0].finish_reason is None: @@ -1406,8 +1404,7 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": None}, - ) + stream_options={"include_usage": None}) # Test stream=False, stream_options={"include_usage": False} with pytest.raises(BadRequestError): @@ -1417,8 +1414,7 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": False}, - ) + stream_options={"include_usage": False}) # Test stream=False, stream_options={"include_usage": True} with pytest.raises(BadRequestError): @@ -1428,8 +1424,7 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": True}, - ) + stream_options={"include_usage": True}) @pytest.mark.asyncio @@ -1448,8 +1443,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=True, - stream_options={"include_usage": False}, - ) + stream_options={"include_usage": False}) chunks = [] async for chunk in stream: chunks.append(chunk.choices[0].text) @@ -1463,8 +1457,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=True, - stream_options={"include_usage": True}, - ) + stream_options={"include_usage": True}) chunks = [] finish_reason_count = 0 async for chunk in stream: @@ -1486,14 +1479,12 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, assert last_message.choices == [] # Test stream=False, stream_options=None - response = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options=None, - ) + response = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options=None) assert response.usage is not None assert response.usage.prompt_tokens > 0 assert response.usage.completion_tokens > 0 @@ -1507,8 +1498,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=False, - stream_options={"include_usage": False}, - ) + stream_options={"include_usage": False}) assert response.usage is None # Test stream=False, stream_options={"include_usage": True} @@ -1518,8 +1508,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=False, - stream_options={"include_usage": True}, - ) + 
stream_options={"include_usage": True}) assert response.usage is not None assert response.usage.prompt_tokens > 0 assert response.usage.completion_tokens > 0 From 776c73b8323e31b81f215a52beb55df295ce6a2e Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 17:26:43 +0300 Subject: [PATCH 14/21] Reorder tests in server test file to resolve conflicts - Moved stream options tests before embeddings tests to address conflicts between pytest fixtures. - This change is in response to a suggestion from DarkLight1337. - Adjustments made to ensure test suite runs without errors. --- tests/entrypoints/test_openai_server.py | 351 ++++++++++++------------ 1 file changed, 175 insertions(+), 176 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index ca24569e3f3e8..ff5779dbb802c 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -547,6 +547,181 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI, assert "".join(chunks) == output +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_chat_completion_stream_options(server, + client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "What is the capital of France?" + }] + + # Test stream=True, stream_options={"include_usage": False} + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=True, + stream_options={"include_usage": False}) + chunks = [] + async for chunk in stream: + chunks.append(chunk.choices[0].delta.get("content", "")) + assert len(chunks) > 0 + assert all(chunk.usage is None for chunk in chunks) + + # Test stream=True, stream_options={"include_usage": True} + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=True, + stream_options={"include_usage": True}) + chunks = [] + async for chunk in stream: + if chunk.choices[0].finish_reason is None: + assert chunk.usage is None + chunks.append(chunk.choices[0].delta.get("content", "")) + else: + assert chunk.usage is None + + # The last message should have usage and no choices + last_message = await stream.__anext__() + assert last_message.usage is not None + assert last_message.usage.prompt_tokens > 0 + assert last_message.usage.completion_tokens > 0 + assert last_message.usage.total_tokens == ( + last_message.usage.prompt_tokens + + last_message.usage.completion_tokens) + assert last_message.choices == [] + + # Test stream=False, stream_options={"include_usage": None} + with pytest.raises(BadRequestError): + await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=False, + stream_options={"include_usage": None}) + + # Test stream=False, stream_options={"include_usage": False} + with pytest.raises(BadRequestError): + await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=False, + stream_options={"include_usage": False}) + + # Test stream=False, stream_options={"include_usage": True} + with pytest.raises(BadRequestError): + await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}) + + 
+@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_completion_stream_options(server, client: openai.AsyncOpenAI, + model_name: str): + prompt = "What is the capital of France?" + + # Test stream=True, stream_options={"include_usage": False} + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={"include_usage": False}) + chunks = [] + async for chunk in stream: + chunks.append(chunk.choices[0].text) + assert len(chunks) > 0 + assert all(chunk.usage is None for chunk in chunks) + + # Test stream=True, stream_options={"include_usage": True} + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={"include_usage": True}) + chunks = [] + finish_reason_count = 0 + async for chunk in stream: + if chunk.choices[0].finish_reason is None: + assert chunk.usage is None + chunks.append(chunk.choices[0].text) + else: + assert chunk.usage is None + finish_reason_count += 1 + + # The last message should have usage and no choices + last_message = await stream.__anext__() + assert last_message.usage is not None + assert last_message.usage.prompt_tokens > 0 + assert last_message.usage.completion_tokens > 0 + assert ( + last_message.usage.total_tokens == last_message.usage.prompt_tokens + + last_message.usage.completion_tokens) + assert last_message.choices == [] + + # Test stream=False, stream_options=None + response = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options=None) + assert response.usage is not None + assert response.usage.prompt_tokens > 0 + assert response.usage.completion_tokens > 0 + assert (response.usage.total_tokens == response.usage.prompt_tokens + + response.usage.completion_tokens) + + # Test stream=False, stream_options={"include_usage": False} + response = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": False}) + assert response.usage is None + + # Test stream=False, stream_options={"include_usage": True} + response = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}) + assert response.usage is not None + assert response.usage.prompt_tokens > 0 + assert response.usage.completion_tokens > 0 + assert (response.usage.total_tokens == response.usage.prompt_tokens + + response.usage.completion_tokens) + @pytest.mark.asyncio @pytest.mark.parametrize( # just test 1 lora hereafter @@ -1340,181 +1515,5 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.total_tokens == 17 -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_chat_completion_stream_options(server, - client: openai.AsyncOpenAI, - model_name: str): - messages = [{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "What is the capital of France?" 
- }] - - # Test stream=True, stream_options={"include_usage": False} - stream = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - stream_options={"include_usage": False}) - chunks = [] - async for chunk in stream: - chunks.append(chunk.choices[0].delta.get("content", "")) - assert len(chunks) > 0 - assert all(chunk.usage is None for chunk in chunks) - - # Test stream=True, stream_options={"include_usage": True} - stream = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - stream_options={"include_usage": True}) - chunks = [] - async for chunk in stream: - if chunk.choices[0].finish_reason is None: - assert chunk.usage is None - chunks.append(chunk.choices[0].delta.get("content", "")) - else: - assert chunk.usage is None - - # The last message should have usage and no choices - last_message = await stream.__anext__() - assert last_message.usage is not None - assert last_message.usage.prompt_tokens > 0 - assert last_message.usage.completion_tokens > 0 - assert last_message.usage.total_tokens == ( - last_message.usage.prompt_tokens + - last_message.usage.completion_tokens) - assert last_message.choices == [] - - # Test stream=False, stream_options={"include_usage": None} - with pytest.raises(BadRequestError): - await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=False, - stream_options={"include_usage": None}) - - # Test stream=False, stream_options={"include_usage": False} - with pytest.raises(BadRequestError): - await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=False, - stream_options={"include_usage": False}) - - # Test stream=False, stream_options={"include_usage": True} - with pytest.raises(BadRequestError): - await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_completion_stream_options(server, client: openai.AsyncOpenAI, - model_name: str): - prompt = "What is the capital of France?" 
- - # Test stream=True, stream_options={"include_usage": False} - stream = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={"include_usage": False}) - chunks = [] - async for chunk in stream: - chunks.append(chunk.choices[0].text) - assert len(chunks) > 0 - assert all(chunk.usage is None for chunk in chunks) - - # Test stream=True, stream_options={"include_usage": True} - stream = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={"include_usage": True}) - chunks = [] - finish_reason_count = 0 - async for chunk in stream: - if chunk.choices[0].finish_reason is None: - assert chunk.usage is None - chunks.append(chunk.choices[0].text) - else: - assert chunk.usage is None - finish_reason_count += 1 - - # The last message should have usage and no choices - last_message = await stream.__anext__() - assert last_message.usage is not None - assert last_message.usage.prompt_tokens > 0 - assert last_message.usage.completion_tokens > 0 - assert ( - last_message.usage.total_tokens == last_message.usage.prompt_tokens + - last_message.usage.completion_tokens) - assert last_message.choices == [] - - # Test stream=False, stream_options=None - response = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options=None) - assert response.usage is not None - assert response.usage.prompt_tokens > 0 - assert response.usage.completion_tokens > 0 - assert (response.usage.total_tokens == response.usage.prompt_tokens + - response.usage.completion_tokens) - - # Test stream=False, stream_options={"include_usage": False} - response = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": False}) - assert response.usage is None - - # Test stream=False, stream_options={"include_usage": True} - response = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) - assert response.usage is not None - assert response.usage.prompt_tokens > 0 - assert response.usage.completion_tokens > 0 - assert (response.usage.total_tokens == response.usage.prompt_tokens + - response.usage.completion_tokens) - - if __name__ == "__main__": pytest.main([__file__]) From 4c828f4dcba821220098d354da98a381af6a651b Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 17:29:01 +0300 Subject: [PATCH 15/21] Reorder tests in server test file to resolve conflicts - Moved stream options tests before embeddings tests to address conflicts between pytest fixtures. - This change is in response to a suggestion from DarkLight1337. - Adjustments made to ensure test suite runs without errors. 
-- Formated code --- tests/entrypoints/test_openai_server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index ff5779dbb802c..9296148a0930c 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -721,7 +721,8 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, assert response.usage.completion_tokens > 0 assert (response.usage.total_tokens == response.usage.prompt_tokens + response.usage.completion_tokens) - + + @pytest.mark.asyncio @pytest.mark.parametrize( # just test 1 lora hereafter From 86d6d7a13192a45c8cdf097a1a56f31940f880e0 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 19:33:35 +0300 Subject: [PATCH 16/21] feat(tests): Update test cases for chat and completion streaming options - **Stream with `include_usage: False`**: - Added assertions to ensure no chunk contains the `usage` key. - **Stream with `include_usage: True`**: - Modified test logic to verify that every chunk has `usage` as `None` except for the last chunk, which should have `usage` populated. - **Stream=False configurations**: - Added tests to verify that using `stream_options: {"include_usage": None}`, `{"include_usage": False}`, and `{"include_usage": True}` raises a `BadRequestError`. - Removed redundant test for `stream=False` with `stream_options: {"include_usage": False}` as it overlaps with the error condition checks. --- tests/entrypoints/test_openai_server.py | 163 +++++++++--------------- 1 file changed, 63 insertions(+), 100 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 9296148a0930c..e7c7178b38db1 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -547,21 +547,20 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI, assert "".join(chunks) == output +import pytest +import openai +from openai.error import BadRequestError + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora"], + ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], ) -async def test_chat_completion_stream_options(server, - client: openai.AsyncOpenAI, - model_name: str): - messages = [{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "What is the capital of France?" 
- }] +async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"} + ] # Test stream=True, stream_options={"include_usage": False} stream = await client.chat.completions.create( @@ -570,12 +569,10 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=True, - stream_options={"include_usage": False}) - chunks = [] + stream_options={"include_usage": False} + ) async for chunk in stream: - chunks.append(chunk.choices[0].delta.get("content", "")) - assert len(chunks) > 0 - assert all(chunk.usage is None for chunk in chunks) + assert "usage" not in chunk.__dict__ # Test stream=True, stream_options={"include_usage": True} stream = await client.chat.completions.create( @@ -584,24 +581,21 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=True, - stream_options={"include_usage": True}) - chunks = [] + stream_options={"include_usage": True} + ) async for chunk in stream: if chunk.choices[0].finish_reason is None: assert chunk.usage is None - chunks.append(chunk.choices[0].delta.get("content", "")) else: - assert chunk.usage is None - - # The last message should have usage and no choices - last_message = await stream.__anext__() - assert last_message.usage is not None - assert last_message.usage.prompt_tokens > 0 - assert last_message.usage.completion_tokens > 0 - assert last_message.usage.total_tokens == ( - last_message.usage.prompt_tokens + - last_message.usage.completion_tokens) - assert last_message.choices == [] + assert chunk.usage is None # Last chunk in stream should have usage as None + final_chunk = await stream.__anext__() + assert final_chunk.usage is not None + assert final_chunk.usage.prompt_tokens > 0 + assert final_chunk.usage.completion_tokens > 0 + assert final_chunk.usage.total_tokens == ( + final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens + ) + assert final_chunk.choices == [] # Test stream=False, stream_options={"include_usage": None} with pytest.raises(BadRequestError): @@ -611,17 +605,8 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": None}) - - # Test stream=False, stream_options={"include_usage": False} - with pytest.raises(BadRequestError): - await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=False, - stream_options={"include_usage": False}) + stream_options={"include_usage": None} + ) # Test stream=False, stream_options={"include_usage": True} with pytest.raises(BadRequestError): @@ -631,16 +616,16 @@ async def test_chat_completion_stream_options(server, max_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": True}) + stream_options={"include_usage": True} + ) @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora"], + ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], ) -async def test_completion_stream_options(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): prompt = "What is the capital of France?" 
# Test stream=True, stream_options={"include_usage": False} @@ -650,12 +635,10 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=True, - stream_options={"include_usage": False}) - chunks = [] + stream_options={"include_usage": False} + ) async for chunk in stream: - chunks.append(chunk.choices[0].text) - assert len(chunks) > 0 - assert all(chunk.usage is None for chunk in chunks) + assert "usage" not in chunk.__dict__ # Test stream=True, stream_options={"include_usage": True} stream = await client.completions.create( @@ -664,63 +647,43 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=True, - stream_options={"include_usage": True}) - chunks = [] - finish_reason_count = 0 + stream_options={"include_usage": True} + ) async for chunk in stream: if chunk.choices[0].finish_reason is None: assert chunk.usage is None - chunks.append(chunk.choices[0].text) else: - assert chunk.usage is None - finish_reason_count += 1 + assert chunk.usage is None # The last chunk should have `usage` filled + final_chunk = await stream.__anext__() + assert final_chunk.usage is not None + assert final_chunk.usage.prompt_tokens > 0 + assert final_chunk.usage.completion_tokens > 0 + assert final_chunk.usage.total_tokens == ( + final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens + ) + assert final_chunk.choices == [] - # The last message should have usage and no choices - last_message = await stream.__anext__() - assert last_message.usage is not None - assert last_message.usage.prompt_tokens > 0 - assert last_message.usage.completion_tokens > 0 - assert ( - last_message.usage.total_tokens == last_message.usage.prompt_tokens + - last_message.usage.completion_tokens) - assert last_message.choices == [] - - # Test stream=False, stream_options=None - response = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options=None) - assert response.usage is not None - assert response.usage.prompt_tokens > 0 - assert response.usage.completion_tokens > 0 - assert (response.usage.total_tokens == response.usage.prompt_tokens + - response.usage.completion_tokens) - - # Test stream=False, stream_options={"include_usage": False} - response = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": False}) - assert response.usage is None + # Test stream=False, stream_options={"include_usage": None} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": None} + ) # Test stream=False, stream_options={"include_usage": True} - response = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) - assert response.usage is not None - assert response.usage.prompt_tokens > 0 - assert response.usage.completion_tokens > 0 - assert (response.usage.total_tokens == response.usage.prompt_tokens + - response.usage.completion_tokens) + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": True} + ) @pytest.mark.asyncio From 
fb6ae02a9871c129b68e4843d737155c6c35999b Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 19:34:50 +0300 Subject: [PATCH 17/21] Formatting related issues resolved. --- tests/entrypoints/test_openai_server.py | 80 +++++++++++-------------- 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index e7c7178b38db1..29ff2ed5c7776 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -547,20 +547,21 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI, assert "".join(chunks) == output -import pytest -import openai -from openai.error import BadRequestError - @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], ) -async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): - messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "What is the capital of France?"} - ] +async def test_chat_completion_stream_options(server, + client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "What is the capital of France?" + }] # Test stream=True, stream_options={"include_usage": False} stream = await client.chat.completions.create( @@ -569,8 +570,7 @@ async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI max_tokens=10, temperature=0.0, stream=True, - stream_options={"include_usage": False} - ) + stream_options={"include_usage": False}) async for chunk in stream: assert "usage" not in chunk.__dict__ @@ -581,20 +581,19 @@ async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI max_tokens=10, temperature=0.0, stream=True, - stream_options={"include_usage": True} - ) + stream_options={"include_usage": True}) async for chunk in stream: if chunk.choices[0].finish_reason is None: assert chunk.usage is None else: - assert chunk.usage is None # Last chunk in stream should have usage as None + assert chunk.usage is None final_chunk = await stream.__anext__() assert final_chunk.usage is not None assert final_chunk.usage.prompt_tokens > 0 assert final_chunk.usage.completion_tokens > 0 assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens - ) + final_chunk.usage.prompt_tokens + + final_chunk.usage.completion_tokens) assert final_chunk.choices == [] # Test stream=False, stream_options={"include_usage": None} @@ -605,8 +604,7 @@ async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI max_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": None} - ) + stream_options={"include_usage": None}) # Test stream=False, stream_options={"include_usage": True} with pytest.raises(BadRequestError): @@ -616,8 +614,7 @@ async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI max_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": True} - ) + stream_options={"include_usage": True}) @pytest.mark.asyncio @@ -625,7 +622,8 @@ async def test_chat_completion_stream_options(server, client: openai.AsyncOpenAI "model_name", ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], ) -async def test_completion_stream_options(server, client: openai.AsyncOpenAI, model_name: str): +async def test_completion_stream_options(server, 
client: openai.AsyncOpenAI, + model_name: str): prompt = "What is the capital of France?" # Test stream=True, stream_options={"include_usage": False} @@ -635,8 +633,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, mod max_tokens=5, temperature=0.0, stream=True, - stream_options={"include_usage": False} - ) + stream_options={"include_usage": False}) async for chunk in stream: assert "usage" not in chunk.__dict__ @@ -647,43 +644,38 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, mod max_tokens=5, temperature=0.0, stream=True, - stream_options={"include_usage": True} - ) + stream_options={"include_usage": True}) async for chunk in stream: if chunk.choices[0].finish_reason is None: assert chunk.usage is None else: - assert chunk.usage is None # The last chunk should have `usage` filled + assert chunk.usage is None final_chunk = await stream.__anext__() assert final_chunk.usage is not None assert final_chunk.usage.prompt_tokens > 0 assert final_chunk.usage.completion_tokens > 0 assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens - ) + final_chunk.usage.prompt_tokens + + final_chunk.usage.completion_tokens) assert final_chunk.choices == [] # Test stream=False, stream_options={"include_usage": None} with pytest.raises(BadRequestError): - await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": None} - ) + await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": None}) # Test stream=False, stream_options={"include_usage": True} with pytest.raises(BadRequestError): - await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": True} - ) + await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}) @pytest.mark.asyncio From 22cc139fd7e381b37b202a1944b3967dbc831523 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 9 Jun 2024 21:39:27 +0300 Subject: [PATCH 18/21] fix(tests): Update streaming tests for correct handling of 'usage' attribute - **Stream with `include_usage: False`**: - Updated tests to assert that the `usage` attribute is `None` instead of checking its absence in the chunk dictionary. This aligns with the observed behavior where `usage` is present but set to `None`. 
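A minimal sketch of why the earlier membership check failed (a stand-in pydantic model, not the actual openai client class): response objects declare usage as an optional field with a None default, so the attribute is always present on the instance and the meaningful assertion is on its value.

from typing import Optional
from pydantic import BaseModel

class ChunkSketch(BaseModel):
    # Optional field with a default: the attribute exists even when the
    # server sends no usage data, it is simply None.
    usage: Optional[dict] = None

chunk = ChunkSketch()
print("usage" in chunk.__dict__)  # True  -> `"usage" not in chunk.__dict__` always fails
print(chunk.usage is None)        # True  -> this is the reliable check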
--- tests/entrypoints/test_openai_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 29ff2ed5c7776..dd3389862eaf5 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -572,7 +572,7 @@ async def test_chat_completion_stream_options(server, stream=True, stream_options={"include_usage": False}) async for chunk in stream: - assert "usage" not in chunk.__dict__ + assert chunk.usage is None # Test stream=True, stream_options={"include_usage": True} stream = await client.chat.completions.create( @@ -635,7 +635,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, stream=True, stream_options={"include_usage": False}) async for chunk in stream: - assert "usage" not in chunk.__dict__ + assert chunk.usage is None # Test stream=True, stream_options={"include_usage": True} stream = await client.completions.create( From 0d9b6b13988d45e9127a369545b613050112a8a7 Mon Sep 17 00:00:00 2001 From: Itay Etelis <92247226+Etelis@users.noreply.github.com> Date: Mon, 10 Jun 2024 12:59:44 +0300 Subject: [PATCH 19/21] Fix: Incorrect indentation causing empty `choices` entries - Incorrect indent caused an empty `choices` list after each generated choice. - Moved the final `usage` creation step back one indent level to fix this. --- vllm/entrypoints/openai/serving_completion.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 9a8f424a18008..c3c40f2b97d14 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -285,18 +285,18 @@ async def completion_stream_generator( response_json = chunk.model_dump_json(exclude_unset=True) yield f"data: {response_json}\n\n" - if (request.stream_options - and request.stream_options.include_usage): - final_usage_chunk = CompletionStreamResponse( - id=request_id, - created=created_time, - model=model_name, - choices=[], - usage=final_usage, - ) - final_usage_data = (final_usage_chunk.model_dump_json( - exclude_unset=True, exclude_none=True)) - yield f"data: {final_usage_data}\n\n" + if (request.stream_options + and request.stream_options.include_usage): + final_usage_chunk = CompletionStreamResponse( + id=request_id, + created=created_time, + model=model_name, + choices=[], + usage=final_usage, + ) + final_usage_data = (final_usage_chunk.model_dump_json( + exclude_unset=True, exclude_none=True)) + yield f"data: {final_usage_data}\n\n" except ValueError as e: # TODO: Use a vllm-specific Validation Error From 184e7d9919c6488cb6cc9c9d50670bf20a7160f3 Mon Sep 17 00:00:00 2001 From: Itay Etelis <92247226+Etelis@users.noreply.github.com> Date: Mon, 10 Jun 2024 13:20:02 +0300 Subject: [PATCH 20/21] Fix `serving_chat.py`: - Incorrect indent caused double sending of a chunk, resulting in a server crash. - Moved the final `usage` creation step back two indent levels to fix this.
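As an illustrative aside (a simplified standalone sketch, not the actual vLLM generator), the control flow this fix and the previous one restore is: the usage-only chunk is built and yielded exactly once per request, after the per-choice streaming loop, rather than inside it.

    from dataclasses import dataclass
    from typing import AsyncIterator, List, Optional

    @dataclass
    class StreamOptions:
        include_usage: bool = False

    @dataclass
    class Request:
        stream_options: Optional[StreamOptions] = None

    async def stream_response(request: Request,
                              texts: List[str]) -> AsyncIterator[str]:
        for text in texts:
            # One SSE data chunk per generated choice.
            yield f"data: {text}\n\n"
        # Outside the loop: at most one trailing usage-only chunk per request,
        # emitted only when the client explicitly asked for it.
        if request.stream_options and request.stream_options.include_usage:
            yield "data: <final usage-only chunk>\n\n"
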
--- vllm/entrypoints/openai/serving_chat.py | 36 ++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 883567abf415b..f20cfcf9d54cf 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -360,25 +360,25 @@ async def chat_completion_stream_generator( yield f"data: {data}\n\n" finish_reason_sent[i] = True - if (request.stream_options - and request.stream_options.include_usage): - final_usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=previous_num_tokens[i], - total_tokens=prompt_tokens + - previous_num_tokens[i], - ) + if (request.stream_options + and request.stream_options.include_usage): + final_usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=previous_num_tokens[i], + total_tokens=prompt_tokens + + previous_num_tokens[i], + ) - final_usage_chunk = ChatCompletionStreamResponse( - id=request_id, - object=chunk_object_type, - created=created_time, - choices=[], - model=model_name, - usage=final_usage) - final_usage_data = (final_usage_chunk.model_dump_json( - exclude_unset=True, exclude_none=True)) - yield f"data: {final_usage_data}\n\n" + final_usage_chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[], + model=model_name, + usage=final_usage) + final_usage_data = (final_usage_chunk.model_dump_json( + exclude_unset=True, exclude_none=True)) + yield f"data: {final_usage_data}\n\n" except ValueError as e: # TODO: Use a vllm-specific Validation Error From e713488b3b33cfebdb5efcac9a4c07080353cd7a Mon Sep 17 00:00:00 2001 From: Itay Etelis <92247226+Etelis@users.noreply.github.com> Date: Mon, 10 Jun 2024 13:29:41 +0300 Subject: [PATCH 21/21] Running format.sh. --- tests/entrypoints/test_openai_server.py | 1 + vllm/entrypoints/openai/serving_chat.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index dd3389862eaf5..d0fe08ae0ddd2 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -582,6 +582,7 @@ async def test_chat_completion_stream_options(server, temperature=0.0, stream=True, stream_options={"include_usage": True}) + async for chunk in stream: if chunk.choices[0].finish_reason is None: assert chunk.usage is None diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index f20cfcf9d54cf..f76194671cd9f 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -365,8 +365,7 @@ async def chat_completion_stream_generator( final_usage = UsageInfo( prompt_tokens=prompt_tokens, completion_tokens=previous_num_tokens[i], - total_tokens=prompt_tokens + - previous_num_tokens[i], + total_tokens=prompt_tokens + previous_num_tokens[i], ) final_usage_chunk = ChatCompletionStreamResponse(