From 21c31df5e2a06049da23bc8381737d80ce59f9da Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 26 May 2024 19:27:24 +0300 Subject: [PATCH 1/9] Add stream_options in ChatCompletionRequest Add StreamOptions Class Add stream_options validation in ChatCompletionRequest --- vllm/entrypoints/openai/protocol.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 41e2f77fe56f1..ba0717e0bea51 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -100,6 +100,8 @@ class ResponseFormat(OpenAIBaseModel): # type must be "json_object" or "text" type: Literal["text", "json_object"] +class StreamOptions(OpenAIBaseModel): + include_usage: Optional[bool] class ChatCompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation @@ -119,6 +121,7 @@ class ChatCompletionRequest(OpenAIBaseModel): le=torch.iinfo(torch.long).max) stop: Optional[Union[str, List[str]]] = Field(default_factory=list) stream: Optional[bool] = False + stream_options: Optional[StreamOptions] = None temperature: Optional[float] = 0.7 top_p: Optional[float] = 1.0 user: Optional[str] = None @@ -251,7 +254,6 @@ def check_guided_decoding_count(cls, data): "('guided_json', 'guided_regex' or 'guided_choice').") return data - class CompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/completions/create @@ -397,6 +399,13 @@ def check_guided_decoding_count(cls, data): "('guided_json', 'guided_regex' or 'guided_choice').") return data + @model_validator(mode="before") + @classmethod + def validate_stream_options(cls, data): + if data.get("stream_options") and not data.get("stream"): + raise ValueError("Stream options can only be defined when stream is True.") + return data + class EmbeddingRequest(BaseModel): # Ordered by official OpenAI API documentation From 9fb7aed33631036862d10a977c27074b0c7d155d Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 30 May 2024 16:49:13 +0300 Subject: [PATCH 2/9] [Feature]: Support `stream_options` option (#4967) - Introduced the `StreamOptions` class in `OpenAIBaseModel` with an optional `include_usage` attribute. - Added `stream_options` attribute to the `ChatCompletionRequest` class, defaulting to `None`. --- vllm/entrypoints/openai/protocol.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index ba0717e0bea51..94eea271ae501 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -536,6 +536,10 @@ class ChatCompletionStreamResponse(OpenAIBaseModel): choices: List[ChatCompletionResponseStreamChoice] usage: Optional[UsageInfo] = Field(default=None) +class UsageInfo(OpenAIBaseModel): + prompt_tokens: int = 0 + total_tokens: int = 0 + completion_tokens: Optional[int] = 0 class BatchRequestInput(OpenAIBaseModel): """ From 45dcf2dda7cae893e3ddfc6753dcd3e68a1ad4dd Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 30 May 2024 16:56:08 +0300 Subject: [PATCH 3/9] -Implemented a validator to ensure `stream_options` can only be set if `stream` is true. 
--- vllm/entrypoints/openai/protocol.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 94eea271ae501..301d6a1cc2b24 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -240,6 +240,12 @@ def logit_bias_logits_processor( logits_processors=logits_processors, ) + @model_validator(mode='before') + def validate_stream_options(cls, values): + if values.get('stream_options') is not None and not values.get('stream'): + raise ValueError("stream_options can only be set if stream is true") + return values + @model_validator(mode="before") @classmethod def check_guided_decoding_count(cls, data): From 8e31211ab5d9aff49d8a6cbb0a2628b982f05e3d Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 30 May 2024 17:05:51 +0300 Subject: [PATCH 4/9] [Feature]: Add optional usage statistics to streaming responses - Updated `chat_completion_stream_generator` to include support for `stream_options` with an `include_usage` flag. - Modified the initial response generation to conditionally include `usage` field based on `stream_options.include_usage`. - Enhanced the token-by-token and finish responses to conditionally include `usage` field if `stream_options.include_usage` is set. - Added a final usage statistics message if `stream_options.include_usage` is set, including prompt tokens and completion tokens. --- vllm/entrypoints/openai/serving_chat.py | 39 ++++++++++++++++++------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7e179362eef8a..9db2ebcc68de5 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -234,6 +234,8 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) + if request.stream_options and request.stream_options.include_usage: + chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" @@ -261,6 +263,8 @@ async def chat_completion_stream_generator( choices=[choice_data], logprobs=None, model=model_name) + if request.stream_options and request.stream_options.include_usage: + chunk.usage = None data = chunk.model_dump_json( exclude_unset=True) yield f"data: {data}\n\n" @@ -302,17 +306,13 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) + if request.stream_options and request.stream_options.include_usage: + chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" else: # Send the finish response for each request.n only once prompt_tokens = len(res.prompt_token_ids) - final_usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=previous_num_tokens[i], - total_tokens=prompt_tokens + - previous_num_tokens[i], - ) choice_data = ChatCompletionResponseStreamChoice( index=i, delta=DeltaMessage(content=delta_text), @@ -325,12 +325,31 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) - if final_usage is not None: - chunk.usage = final_usage - data = chunk.model_dump_json(exclude_unset=True, - exclude_none=True) + if request.stream_options and request.stream_options.include_usage: + chunk.usage = None + data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" finish_reason_sent[i] = True + + if request.stream_options and 
request.stream_options.include_usage: + final_usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=previous_num_tokens[i], + total_tokens=prompt_tokens + + previous_num_tokens[i], + ) + + final_usage_chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[], + model=model_name, + usage=final_usage + ) + final_usage_data = final_usage_chunk.model_dump_json(exclude_unset=True, exclude_none=True) + yield f"data: {final_usage_data}\n\n" + except ValueError as e: # TODO: Use a vllm-specific Validation Error data = self.create_streaming_error_response(str(e)) From f0334e581f6ae89d76c7c779cc37d9fa0026e86d Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 2 Jun 2024 21:40:51 +0300 Subject: [PATCH 5/9] Tests added for the following scenarios: - stream=True, stream_options=None - stream=True, stream_options={"include_usage": True} - stream=True, stream_options={"include_usage": False} - stream=False, stream_options={"include_usage": None} - stream=False, stream_options={"include_usage": False} - stream=False, stream_options={"include_usage": True} --- tests/entrypoints/test_openai_server.py | 99 +++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 1b04e3205c4b8..f72cc30114033 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -971,5 +971,104 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.total_tokens == 17 +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_stream_options(server, client: openai.AsyncOpenAI, model_name: str): + prompt = "What is the capital of France?" 
+ + # Test stream=True, stream_options=None + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options=None, + ) + chunks = [] + async for chunk in stream: + chunks.append(chunk.choices[0].text) + assert len(chunks) > 0 + assert "usage" not in chunk + + # Test stream=True, stream_options={"include_usage": False} + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={"include_usage": False}, + ) + chunks = [] + async for chunk in stream: + chunks.append(chunk.choices[0].text) + assert len(chunks) > 0 + assert "usage" not in chunk + + # Test stream=True, stream_options={"include_usage": True} + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={"include_usage": True}, + ) + chunks = [] + usage_info = None + finish_reason_count = 0 + async for chunk in stream: + if chunk.choices[0].finish_reason is None: + assert chunk.usage is None + chunks.append(chunk.choices[0].text) + else: + assert chunk.usage is None + finish_reason_count += 1 + + # The last message should have usage and no choices + last_message = await stream.__anext__() + assert last_message.usage is not None + assert last_message.usage.prompt_tokens > 0 + assert last_message.usage.completion_tokens > 0 + assert last_message.usage.total_tokens == last_message.usage.prompt_tokens + last_message.usage.completion_tokens + assert last_message.choices == [] + + # Test stream=False, stream_options={"include_usage": None} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": None}, + ) + + # Test stream=False, stream_options={"include_usage": False} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": False}, + ) + + # Test stream=False, stream_options={"include_usage": True} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}, + ) + + if __name__ == "__main__": pytest.main([__file__]) From 4a9f80a263cafb0771d10afacf404b669109ff06 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Mon, 3 Jun 2024 18:36:35 +0300 Subject: [PATCH 6/9] Forgot to notate validator as a classmethod. --- vllm/entrypoints/openai/protocol.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 301d6a1cc2b24..ad2059c3f517f 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -241,6 +241,7 @@ def logit_bias_logits_processor( ) @model_validator(mode='before') + @classmethod def validate_stream_options(cls, values): if values.get('stream_options') is not None and not values.get('stream'): raise ValueError("stream_options can only be set if stream is true") From 251f76d131e0a2357c5de75760385d75bc084c04 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 6 Jun 2024 18:20:59 +0300 Subject: [PATCH 7/9] Fixed failing tests based on: - Duplicated Usage defenition in protocol.py. - Line too long in several files. 
--- tests/entrypoints/test_openai_server.py | 7 ++++--- vllm/entrypoints/openai/protocol.py | 10 +++------- vllm/entrypoints/openai/serving_chat.py | 22 +++++++++++++++------- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 65f382a67fe1f..217b0ec2fc13d 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1347,7 +1347,8 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME], ) -async def test_stream_options(server, client: openai.AsyncOpenAI, model_name: str): +async def test_stream_options(server, client: openai.AsyncOpenAI, + model_name: str): prompt = "What is the capital of France?" # Test stream=True, stream_options=None @@ -1390,7 +1391,6 @@ async def test_stream_options(server, client: openai.AsyncOpenAI, model_name: st stream_options={"include_usage": True}, ) chunks = [] - usage_info = None finish_reason_count = 0 async for chunk in stream: if chunk.choices[0].finish_reason is None: @@ -1405,7 +1405,8 @@ async def test_stream_options(server, client: openai.AsyncOpenAI, model_name: st assert last_message.usage is not None assert last_message.usage.prompt_tokens > 0 assert last_message.usage.completion_tokens > 0 - assert last_message.usage.total_tokens == last_message.usage.prompt_tokens + last_message.usage.completion_tokens + assert last_message.usage.total_tokens == (last_message.usage.prompt_tokens + + last_message.usage.completion_tokens) assert last_message.choices == [] # Test stream=False, stream_options={"include_usage": None} diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 22e035c8b2280..003fccd4aa82a 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -275,10 +275,11 @@ def logit_bias_logits_processor( @model_validator(mode='before') @classmethod def validate_stream_options(cls, values): - if values.get('stream_options') is not None and not values.get('stream'): + if (values.get('stream_options') is not None and + not values.get('stream')): raise ValueError("stream_options can only be set if stream is true") return values - + @model_validator(mode="before") @classmethod def check_guided_decoding_count(cls, data): @@ -635,11 +636,6 @@ class ChatCompletionStreamResponse(OpenAIBaseModel): choices: List[ChatCompletionResponseStreamChoice] usage: Optional[UsageInfo] = Field(default=None) -class UsageInfo(OpenAIBaseModel): - prompt_tokens: int = 0 - total_tokens: int = 0 - completion_tokens: Optional[int] = 0 - class BatchRequestInput(OpenAIBaseModel): """ The per-line object of the batch input file. 
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index f1bf57c13907a..981d1e5854ba4 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -247,7 +247,8 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) - if request.stream_options and request.stream_options.include_usage: + if (request.stream_options and + request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" @@ -276,7 +277,8 @@ async def chat_completion_stream_generator( choices=[choice_data], logprobs=None, model=model_name) - if request.stream_options and request.stream_options.include_usage: + if (request.stream_options and + request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json( exclude_unset=True) @@ -331,7 +333,8 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) - if request.stream_options and request.stream_options.include_usage: + if (request.stream_options and + request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" @@ -350,16 +353,19 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) - if request.stream_options and request.stream_options.include_usage: + if (request.stream_options and + request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" finish_reason_sent[i] = True - if request.stream_options and request.stream_options.include_usage: + if (request.stream_options and + request.stream_options.include_usage): final_usage = UsageInfo( prompt_tokens=prompt_tokens, - completion_tokens=previous_num_tokens[i], + completion_tokens=previous_num_tokens[i] + , total_tokens=prompt_tokens + previous_num_tokens[i], ) @@ -372,7 +378,9 @@ async def chat_completion_stream_generator( model=model_name, usage=final_usage ) - final_usage_data = final_usage_chunk.model_dump_json(exclude_unset=True, exclude_none=True) + final_usage_data = (final_usage_chunk.model_dump_json + (exclude_unset=True, + exclude_none=True)) yield f"data: {final_usage_data}\n\n" except ValueError as e: From 78fccd53ef8d98462bffbc81dd931447dbb5bcef Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 6 Jun 2024 18:30:15 +0300 Subject: [PATCH 8/9] Fixed failing tests: - Yapf formating. --- tests/entrypoints/test_openai_server.py | 7 +++-- vllm/entrypoints/openai/protocol.py | 14 ++++++--- vllm/entrypoints/openai/serving_chat.py | 41 ++++++++++++------------- 3 files changed, 33 insertions(+), 29 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 217b0ec2fc13d..b7d0946ba7244 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1347,7 +1347,7 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME], ) -async def test_stream_options(server, client: openai.AsyncOpenAI, +async def test_stream_options(server, client: openai.AsyncOpenAI, model_name: str): prompt = "What is the capital of France?" 
@@ -1405,8 +1405,9 @@ async def test_stream_options(server, client: openai.AsyncOpenAI, assert last_message.usage is not None assert last_message.usage.prompt_tokens > 0 assert last_message.usage.completion_tokens > 0 - assert last_message.usage.total_tokens == (last_message.usage.prompt_tokens - + last_message.usage.completion_tokens) + assert last_message.usage.total_tokens == ( + last_message.usage.prompt_tokens + + last_message.usage.completion_tokens) assert last_message.choices == [] # Test stream=False, stream_options={"include_usage": None} diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 003fccd4aa82a..6dd0858a4ee7f 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -101,9 +101,11 @@ class ResponseFormat(OpenAIBaseModel): # type must be "json_object" or "text" type: Literal["text", "json_object"] + class StreamOptions(OpenAIBaseModel): include_usage: Optional[bool] + class FunctionDefinition(OpenAIBaseModel): name: str description: Optional[str] = None @@ -275,9 +277,10 @@ def logit_bias_logits_processor( @model_validator(mode='before') @classmethod def validate_stream_options(cls, values): - if (values.get('stream_options') is not None and - not values.get('stream')): - raise ValueError("stream_options can only be set if stream is true") + if (values.get('stream_options') is not None + and not values.get('stream')): + raise ValueError( + "stream_options can only be set if stream is true") return values @model_validator(mode="before") @@ -636,6 +639,7 @@ class ChatCompletionStreamResponse(OpenAIBaseModel): choices: List[ChatCompletionResponseStreamChoice] usage: Optional[UsageInfo] = Field(default=None) + class BatchRequestInput(OpenAIBaseModel): """ The per-line object of the batch input file. @@ -656,7 +660,9 @@ class BatchRequestInput(OpenAIBaseModel): url: str # The parameteters of the request. 
- body: Union[ChatCompletionRequest, ] + body: Union[ + ChatCompletionRequest, + ] class BatchRequestOutput(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 981d1e5854ba4..883567abf415b 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -247,8 +247,8 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) - if (request.stream_options and - request.stream_options.include_usage): + if (request.stream_options + and request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" @@ -277,7 +277,7 @@ async def chat_completion_stream_generator( choices=[choice_data], logprobs=None, model=model_name) - if (request.stream_options and + if (request.stream_options and request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json( @@ -333,8 +333,8 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) - if (request.stream_options and - request.stream_options.include_usage): + if (request.stream_options + and request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" @@ -353,36 +353,33 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) - if (request.stream_options and - request.stream_options.include_usage): + if (request.stream_options + and request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" finish_reason_sent[i] = True - if (request.stream_options and - request.stream_options.include_usage): + if (request.stream_options + and request.stream_options.include_usage): final_usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=previous_num_tokens[i] - , - total_tokens=prompt_tokens + - previous_num_tokens[i], - ) - + prompt_tokens=prompt_tokens, + completion_tokens=previous_num_tokens[i], + total_tokens=prompt_tokens + + previous_num_tokens[i], + ) + final_usage_chunk = ChatCompletionStreamResponse( id=request_id, object=chunk_object_type, created=created_time, choices=[], model=model_name, - usage=final_usage - ) - final_usage_data = (final_usage_chunk.model_dump_json - (exclude_unset=True, - exclude_none=True)) + usage=final_usage) + final_usage_data = (final_usage_chunk.model_dump_json( + exclude_unset=True, exclude_none=True)) yield f"data: {final_usage_data}\n\n" - + except ValueError as e: # TODO: Use a vllm-specific Validation Error data = self.create_streaming_error_response(str(e)) From 527f79cbc4ef28939ca5379f1d1766ff6bb07741 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 6 Jun 2024 18:33:50 +0300 Subject: [PATCH 9/9] FIxing failing tests: - yapf in protocol file. --- vllm/entrypoints/openai/protocol.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 6dd0858a4ee7f..fa33318786b9a 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -660,9 +660,7 @@ class BatchRequestInput(OpenAIBaseModel): url: str # The parameteters of the request. - body: Union[ - ChatCompletionRequest, - ] + body: Union[ChatCompletionRequest, ] class BatchRequestOutput(OpenAIBaseModel):
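
For reference, a minimal client-side sketch of the behaviour this series adds (a usage illustration, not one of the patches above). Note the tests exercise the /v1/completions endpoint, while the sketch below uses /v1/chat/completions, which is what the serving_chat.py changes implement. Assumptions: a vLLM OpenAI-compatible server at http://localhost:8000/v1, "my-model" as a placeholder model name, and an `openai` client recent enough to accept the `stream_options` parameter; adjust these to your setup. With `include_usage` set, every regular chunk carries a null `usage` field, and one final chunk with empty `choices` reports the token counts, matching the assertions in `test_stream_options`; passing `stream_options` with `stream=False` is rejected by the new validator (BadRequestError).

    # Hypothetical usage sketch for the `stream_options` feature in this series.
    # Assumptions: vLLM server at localhost:8000, "my-model" is a placeholder,
    # and the installed `openai` client supports the `stream_options` argument.
    import asyncio

    import openai


    async def main() -> None:
        client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                    api_key="EMPTY")
        stream = await client.chat.completions.create(
            model="my-model",
            messages=[{
                "role": "user",
                "content": "What is the capital of France?"
            }],
            max_tokens=5,
            temperature=0.0,
            stream=True,
            stream_options={"include_usage": True},
        )
        async for chunk in stream:
            if chunk.choices:
                # Regular chunks: delta text, usage stays null.
                print(chunk.choices[0].delta.content or "", end="")
            else:
                # Final chunk: empty choices, populated usage statistics.
                print(f"\nprompt={chunk.usage.prompt_tokens} "
                      f"completion={chunk.usage.completion_tokens} "
                      f"total={chunk.usage.total_tokens}")


    if __name__ == "__main__":
        asyncio.run(main())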