diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 5e3c5e327ef63..3ce9db0e47eed 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -19,8 +19,8 @@ class EngineCoreRequest:
     # due to circular imports and typing we have in data.py
 
     request_id: str
-    #NOTE(Nick): I don't think we need to pass prompt here since it should
-    # always be tokenized?
+    # NOTE(ywang96): original text prompt is needed when a request is added to
+    # Detokenizer, but set to None when it is added to EngineCoreClient.
     prompt: Optional[str]
     prompt_token_ids: List[int]
     mm_inputs: Optional[List[Optional["MultiModalKwargs"]]]
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index a4a45ae05ff9e..4ed7f944b058f 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -219,6 +219,9 @@ def _send_input(self, request_type: EngineCoreRequestType,
         self.input_socket.send_multipart(msg, copy=False)
 
     def add_request(self, request: EngineCoreRequest) -> None:
+        # NOTE: text prompt is not needed in the core engine as it has been
+        # tokenized.
+        request.prompt = None
         self._send_input(EngineCoreRequestType.ADD, request)
 
     def abort_requests(self, request_ids: List[str]) -> None:
@@ -257,6 +260,9 @@ async def _send_input(self, request_type: EngineCoreRequestType,
         await self.input_socket.send_multipart(msg, copy=False)
 
     async def add_request_async(self, request: EngineCoreRequest) -> None:
+        # NOTE: text prompt is not needed in the core engine as it has been
+        # tokenized.
+        request.prompt = None
         await self._send_input(EngineCoreRequestType.ADD, request)
 
     async def abort_requests_async(self, request_ids: List[str]) -> None: