From 21c31df5e2a06049da23bc8381737d80ce59f9da Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 26 May 2024 19:27:24 +0300 Subject: [PATCH 1/9] Add stream_options in ChatCompletionRequest Add StreamOptions Class Add stream_options validation in ChatCompletionRequest --- vllm/entrypoints/openai/protocol.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 41e2f77fe56f1..ba0717e0bea51 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -100,6 +100,8 @@ class ResponseFormat(OpenAIBaseModel): # type must be "json_object" or "text" type: Literal["text", "json_object"] +class StreamOptions(OpenAIBaseModel): + include_usage: Optional[bool] class ChatCompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation @@ -119,6 +121,7 @@ class ChatCompletionRequest(OpenAIBaseModel): le=torch.iinfo(torch.long).max) stop: Optional[Union[str, List[str]]] = Field(default_factory=list) stream: Optional[bool] = False + stream_options: Optional[StreamOptions] = None temperature: Optional[float] = 0.7 top_p: Optional[float] = 1.0 user: Optional[str] = None @@ -251,7 +254,6 @@ def check_guided_decoding_count(cls, data): "('guided_json', 'guided_regex' or 'guided_choice').") return data - class CompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/completions/create @@ -397,6 +399,13 @@ def check_guided_decoding_count(cls, data): "('guided_json', 'guided_regex' or 'guided_choice').") return data + @model_validator(mode="before") + @classmethod + def validate_stream_options(cls, data): + if data.get("stream_options") and not data.get("stream"): + raise ValueError("Stream options can only be defined when stream is True.") + return data + class EmbeddingRequest(BaseModel): # Ordered by official OpenAI API documentation From 9fb7aed33631036862d10a977c27074b0c7d155d Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 30 May 2024 16:49:13 +0300 Subject: [PATCH 2/9] [Feature]: Support `stream_options` option (#4967) - Introduced the `StreamOptions` class in `OpenAIBaseModel` with an optional `include_usage` attribute. - Added `stream_options` attribute to the `ChatCompletionRequest` class, defaulting to `None`. --- vllm/entrypoints/openai/protocol.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index ba0717e0bea51..94eea271ae501 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -536,6 +536,10 @@ class ChatCompletionStreamResponse(OpenAIBaseModel): choices: List[ChatCompletionResponseStreamChoice] usage: Optional[UsageInfo] = Field(default=None) +class UsageInfo(OpenAIBaseModel): + prompt_tokens: int = 0 + total_tokens: int = 0 + completion_tokens: Optional[int] = 0 class BatchRequestInput(OpenAIBaseModel): """ From 45dcf2dda7cae893e3ddfc6753dcd3e68a1ad4dd Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 30 May 2024 16:56:08 +0300 Subject: [PATCH 3/9] -Implemented a validator to ensure `stream_options` can only be set if `stream` is true. 
--- vllm/entrypoints/openai/protocol.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 94eea271ae501..301d6a1cc2b24 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -240,6 +240,12 @@ def logit_bias_logits_processor( logits_processors=logits_processors, ) + @model_validator(mode='before') + def validate_stream_options(cls, values): + if values.get('stream_options') is not None and not values.get('stream'): + raise ValueError("stream_options can only be set if stream is true") + return values + @model_validator(mode="before") @classmethod def check_guided_decoding_count(cls, data): From 8e31211ab5d9aff49d8a6cbb0a2628b982f05e3d Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 30 May 2024 17:05:51 +0300 Subject: [PATCH 4/9] [Feature]: Add optional usage statistics to streaming responses - Updated `chat_completion_stream_generator` to include support for `stream_options` with an `include_usage` flag. - Modified the initial response generation to conditionally include `usage` field based on `stream_options.include_usage`. - Enhanced the token-by-token and finish responses to conditionally include `usage` field if `stream_options.include_usage` is set. - Added a final usage statistics message if `stream_options.include_usage` is set, including prompt tokens and completion tokens. --- vllm/entrypoints/openai/serving_chat.py | 39 ++++++++++++++++++------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7e179362eef8a..9db2ebcc68de5 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -234,6 +234,8 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) + if request.stream_options and request.stream_options.include_usage: + chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" @@ -261,6 +263,8 @@ async def chat_completion_stream_generator( choices=[choice_data], logprobs=None, model=model_name) + if request.stream_options and request.stream_options.include_usage: + chunk.usage = None data = chunk.model_dump_json( exclude_unset=True) yield f"data: {data}\n\n" @@ -302,17 +306,13 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) + if request.stream_options and request.stream_options.include_usage: + chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" else: # Send the finish response for each request.n only once prompt_tokens = len(res.prompt_token_ids) - final_usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=previous_num_tokens[i], - total_tokens=prompt_tokens + - previous_num_tokens[i], - ) choice_data = ChatCompletionResponseStreamChoice( index=i, delta=DeltaMessage(content=delta_text), @@ -325,12 +325,31 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) - if final_usage is not None: - chunk.usage = final_usage - data = chunk.model_dump_json(exclude_unset=True, - exclude_none=True) + if request.stream_options and request.stream_options.include_usage: + chunk.usage = None + data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" finish_reason_sent[i] = True + + if request.stream_options and 
request.stream_options.include_usage: + final_usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=previous_num_tokens[i], + total_tokens=prompt_tokens + + previous_num_tokens[i], + ) + + final_usage_chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[], + model=model_name, + usage=final_usage + ) + final_usage_data = final_usage_chunk.model_dump_json(exclude_unset=True, exclude_none=True) + yield f"data: {final_usage_data}\n\n" + except ValueError as e: # TODO: Use a vllm-specific Validation Error data = self.create_streaming_error_response(str(e)) From f0334e581f6ae89d76c7c779cc37d9fa0026e86d Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Sun, 2 Jun 2024 21:40:51 +0300 Subject: [PATCH 5/9] Tests added for the following scenarios: - stream=True, stream_options=None - stream=True, stream_options={"include_usage": True} - stream=True, stream_options={"include_usage": False} - stream=False, stream_options={"include_usage": None} - stream=False, stream_options={"include_usage": False} - stream=False, stream_options={"include_usage": True} --- tests/entrypoints/test_openai_server.py | 99 +++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 1b04e3205c4b8..f72cc30114033 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -971,5 +971,104 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.total_tokens == 17 +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_stream_options(server, client: openai.AsyncOpenAI, model_name: str): + prompt = "What is the capital of France?" 
+ + # Test stream=True, stream_options=None + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options=None, + ) + chunks = [] + async for chunk in stream: + chunks.append(chunk.choices[0].text) + assert len(chunks) > 0 + assert "usage" not in chunk + + # Test stream=True, stream_options={"include_usage": False} + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={"include_usage": False}, + ) + chunks = [] + async for chunk in stream: + chunks.append(chunk.choices[0].text) + assert len(chunks) > 0 + assert "usage" not in chunk + + # Test stream=True, stream_options={"include_usage": True} + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={"include_usage": True}, + ) + chunks = [] + usage_info = None + finish_reason_count = 0 + async for chunk in stream: + if chunk.choices[0].finish_reason is None: + assert chunk.usage is None + chunks.append(chunk.choices[0].text) + else: + assert chunk.usage is None + finish_reason_count += 1 + + # The last message should have usage and no choices + last_message = await stream.__anext__() + assert last_message.usage is not None + assert last_message.usage.prompt_tokens > 0 + assert last_message.usage.completion_tokens > 0 + assert last_message.usage.total_tokens == last_message.usage.prompt_tokens + last_message.usage.completion_tokens + assert last_message.choices == [] + + # Test stream=False, stream_options={"include_usage": None} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": None}, + ) + + # Test stream=False, stream_options={"include_usage": False} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": False}, + ) + + # Test stream=False, stream_options={"include_usage": True} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}, + ) + + if __name__ == "__main__": pytest.main([__file__]) From 4a9f80a263cafb0771d10afacf404b669109ff06 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Mon, 3 Jun 2024 18:36:35 +0300 Subject: [PATCH 6/9] Forgot to notate validator as a classmethod. --- vllm/entrypoints/openai/protocol.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 301d6a1cc2b24..ad2059c3f517f 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -241,6 +241,7 @@ def logit_bias_logits_processor( ) @model_validator(mode='before') + @classmethod def validate_stream_options(cls, values): if values.get('stream_options') is not None and not values.get('stream'): raise ValueError("stream_options can only be set if stream is true") From 251f76d131e0a2357c5de75760385d75bc084c04 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 6 Jun 2024 18:20:59 +0300 Subject: [PATCH 7/9] Fixed failing tests based on: - Duplicated Usage defenition in protocol.py. - Line too long in several files. 
--- tests/entrypoints/test_openai_server.py | 7 ++++--- vllm/entrypoints/openai/protocol.py | 10 +++------- vllm/entrypoints/openai/serving_chat.py | 22 +++++++++++++++------- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 65f382a67fe1f..217b0ec2fc13d 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1347,7 +1347,8 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME], ) -async def test_stream_options(server, client: openai.AsyncOpenAI, model_name: str): +async def test_stream_options(server, client: openai.AsyncOpenAI, + model_name: str): prompt = "What is the capital of France?" # Test stream=True, stream_options=None @@ -1390,7 +1391,6 @@ async def test_stream_options(server, client: openai.AsyncOpenAI, model_name: st stream_options={"include_usage": True}, ) chunks = [] - usage_info = None finish_reason_count = 0 async for chunk in stream: if chunk.choices[0].finish_reason is None: @@ -1405,7 +1405,8 @@ async def test_stream_options(server, client: openai.AsyncOpenAI, model_name: st assert last_message.usage is not None assert last_message.usage.prompt_tokens > 0 assert last_message.usage.completion_tokens > 0 - assert last_message.usage.total_tokens == last_message.usage.prompt_tokens + last_message.usage.completion_tokens + assert last_message.usage.total_tokens == (last_message.usage.prompt_tokens + + last_message.usage.completion_tokens) assert last_message.choices == [] # Test stream=False, stream_options={"include_usage": None} diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 22e035c8b2280..003fccd4aa82a 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -275,10 +275,11 @@ def logit_bias_logits_processor( @model_validator(mode='before') @classmethod def validate_stream_options(cls, values): - if values.get('stream_options') is not None and not values.get('stream'): + if (values.get('stream_options') is not None and + not values.get('stream')): raise ValueError("stream_options can only be set if stream is true") return values - + @model_validator(mode="before") @classmethod def check_guided_decoding_count(cls, data): @@ -635,11 +636,6 @@ class ChatCompletionStreamResponse(OpenAIBaseModel): choices: List[ChatCompletionResponseStreamChoice] usage: Optional[UsageInfo] = Field(default=None) -class UsageInfo(OpenAIBaseModel): - prompt_tokens: int = 0 - total_tokens: int = 0 - completion_tokens: Optional[int] = 0 - class BatchRequestInput(OpenAIBaseModel): """ The per-line object of the batch input file. 
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index f1bf57c13907a..981d1e5854ba4 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -247,7 +247,8 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) - if request.stream_options and request.stream_options.include_usage: + if (request.stream_options and + request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" @@ -276,7 +277,8 @@ async def chat_completion_stream_generator( choices=[choice_data], logprobs=None, model=model_name) - if request.stream_options and request.stream_options.include_usage: + if (request.stream_options and + request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json( exclude_unset=True) @@ -331,7 +333,8 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) - if request.stream_options and request.stream_options.include_usage: + if (request.stream_options and + request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" @@ -350,16 +353,19 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) - if request.stream_options and request.stream_options.include_usage: + if (request.stream_options and + request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" finish_reason_sent[i] = True - if request.stream_options and request.stream_options.include_usage: + if (request.stream_options and + request.stream_options.include_usage): final_usage = UsageInfo( prompt_tokens=prompt_tokens, - completion_tokens=previous_num_tokens[i], + completion_tokens=previous_num_tokens[i] + , total_tokens=prompt_tokens + previous_num_tokens[i], ) @@ -372,7 +378,9 @@ async def chat_completion_stream_generator( model=model_name, usage=final_usage ) - final_usage_data = final_usage_chunk.model_dump_json(exclude_unset=True, exclude_none=True) + final_usage_data = (final_usage_chunk.model_dump_json + (exclude_unset=True, + exclude_none=True)) yield f"data: {final_usage_data}\n\n" except ValueError as e: From 78fccd53ef8d98462bffbc81dd931447dbb5bcef Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 6 Jun 2024 18:30:15 +0300 Subject: [PATCH 8/9] Fixed failing tests: - Yapf formating. --- tests/entrypoints/test_openai_server.py | 7 +++-- vllm/entrypoints/openai/protocol.py | 14 ++++++--- vllm/entrypoints/openai/serving_chat.py | 41 ++++++++++++------------- 3 files changed, 33 insertions(+), 29 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 217b0ec2fc13d..b7d0946ba7244 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1347,7 +1347,7 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME], ) -async def test_stream_options(server, client: openai.AsyncOpenAI, +async def test_stream_options(server, client: openai.AsyncOpenAI, model_name: str): prompt = "What is the capital of France?" 
@@ -1405,8 +1405,9 @@ async def test_stream_options(server, client: openai.AsyncOpenAI, assert last_message.usage is not None assert last_message.usage.prompt_tokens > 0 assert last_message.usage.completion_tokens > 0 - assert last_message.usage.total_tokens == (last_message.usage.prompt_tokens - + last_message.usage.completion_tokens) + assert last_message.usage.total_tokens == ( + last_message.usage.prompt_tokens + + last_message.usage.completion_tokens) assert last_message.choices == [] # Test stream=False, stream_options={"include_usage": None} diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 003fccd4aa82a..6dd0858a4ee7f 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -101,9 +101,11 @@ class ResponseFormat(OpenAIBaseModel): # type must be "json_object" or "text" type: Literal["text", "json_object"] + class StreamOptions(OpenAIBaseModel): include_usage: Optional[bool] + class FunctionDefinition(OpenAIBaseModel): name: str description: Optional[str] = None @@ -275,9 +277,10 @@ def logit_bias_logits_processor( @model_validator(mode='before') @classmethod def validate_stream_options(cls, values): - if (values.get('stream_options') is not None and - not values.get('stream')): - raise ValueError("stream_options can only be set if stream is true") + if (values.get('stream_options') is not None + and not values.get('stream')): + raise ValueError( + "stream_options can only be set if stream is true") return values @model_validator(mode="before") @@ -636,6 +639,7 @@ class ChatCompletionStreamResponse(OpenAIBaseModel): choices: List[ChatCompletionResponseStreamChoice] usage: Optional[UsageInfo] = Field(default=None) + class BatchRequestInput(OpenAIBaseModel): """ The per-line object of the batch input file. @@ -656,7 +660,9 @@ class BatchRequestInput(OpenAIBaseModel): url: str # The parameteters of the request. 
- body: Union[ChatCompletionRequest, ] + body: Union[ + ChatCompletionRequest, + ] class BatchRequestOutput(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 981d1e5854ba4..883567abf415b 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -247,8 +247,8 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) - if (request.stream_options and - request.stream_options.include_usage): + if (request.stream_options + and request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" @@ -277,7 +277,7 @@ async def chat_completion_stream_generator( choices=[choice_data], logprobs=None, model=model_name) - if (request.stream_options and + if (request.stream_options and request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json( @@ -333,8 +333,8 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) - if (request.stream_options and - request.stream_options.include_usage): + if (request.stream_options + and request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" @@ -353,36 +353,33 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) - if (request.stream_options and - request.stream_options.include_usage): + if (request.stream_options + and request.stream_options.include_usage): chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" finish_reason_sent[i] = True - if (request.stream_options and - request.stream_options.include_usage): + if (request.stream_options + and request.stream_options.include_usage): final_usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=previous_num_tokens[i] - , - total_tokens=prompt_tokens + - previous_num_tokens[i], - ) - + prompt_tokens=prompt_tokens, + completion_tokens=previous_num_tokens[i], + total_tokens=prompt_tokens + + previous_num_tokens[i], + ) + final_usage_chunk = ChatCompletionStreamResponse( id=request_id, object=chunk_object_type, created=created_time, choices=[], model=model_name, - usage=final_usage - ) - final_usage_data = (final_usage_chunk.model_dump_json - (exclude_unset=True, - exclude_none=True)) + usage=final_usage) + final_usage_data = (final_usage_chunk.model_dump_json( + exclude_unset=True, exclude_none=True)) yield f"data: {final_usage_data}\n\n" - + except ValueError as e: # TODO: Use a vllm-specific Validation Error data = self.create_streaming_error_response(str(e)) From 527f79cbc4ef28939ca5379f1d1766ff6bb07741 Mon Sep 17 00:00:00 2001 From: Itay Etelis Date: Thu, 6 Jun 2024 18:33:50 +0300 Subject: [PATCH 9/9] FIxing failing tests: - yapf in protocol file. --- vllm/entrypoints/openai/protocol.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 6dd0858a4ee7f..fa33318786b9a 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -660,9 +660,7 @@ class BatchRequestInput(OpenAIBaseModel): url: str # The parameteters of the request. - body: Union[ - ChatCompletionRequest, - ] + body: Union[ChatCompletionRequest, ] class BatchRequestOutput(OpenAIBaseModel):
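
For reference, a minimal client-side sketch of the behaviour this series adds (a usage illustration, not one of the patches above). Note the tests exercise the /v1/completions endpoint, while the sketch below uses /v1/chat/completions, which is what the serving_chat.py changes implement. Assumptions: a vLLM OpenAI-compatible server at http://localhost:8000/v1, "my-model" as a placeholder model name, and an `openai` client recent enough to accept the `stream_options` parameter; adjust these to your setup. With `include_usage` set, every regular chunk carries a null `usage` field, and one final chunk with empty `choices` reports the token counts, matching the assertions in `test_stream_options`; passing `stream_options` with `stream=False` is rejected by the new validator (BadRequestError).

    # Hypothetical usage sketch for the `stream_options` feature in this series.
    # Assumptions: vLLM server at localhost:8000, "my-model" is a placeholder,
    # and the installed `openai` client supports the `stream_options` argument.
    import asyncio

    import openai


    async def main() -> None:
        client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                    api_key="EMPTY")
        stream = await client.chat.completions.create(
            model="my-model",
            messages=[{
                "role": "user",
                "content": "What is the capital of France?"
            }],
            max_tokens=5,
            temperature=0.0,
            stream=True,
            stream_options={"include_usage": True},
        )
        async for chunk in stream:
            if chunk.choices:
                # Regular chunks: delta text, usage stays null.
                print(chunk.choices[0].delta.content or "", end="")
            else:
                # Final chunk: empty choices, populated usage statistics.
                print(f"\nprompt={chunk.usage.prompt_tokens} "
                      f"completion={chunk.usage.completion_tokens} "
                      f"total={chunk.usage.total_tokens}")


    if __name__ == "__main__":
        asyncio.run(main())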