From 5d61d13db0c90e7abe66ef52aa964262e23773bf Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 28 Dec 2024 16:09:32 +0000 Subject: [PATCH 01/13] better shutdown --- vllm/entrypoints/openai/api_server.py | 41 +++++++-------------------- vllm/v1/engine/async_llm.py | 14 +++++++++ 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 094cc15a317e9..1bcbdffaeac8b 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -133,32 +133,20 @@ async def build_async_engine_client_from_engine_args( Returns the Client or None if the creation failed. """ - # Fall back - # TODO: fill out feature matrix. + # AsyncLLMEngine. if (MQLLMEngineClient.is_unsupported_config(engine_args) or envs.VLLM_USE_V1 or disable_frontend_multiprocessing): - engine_config = engine_args.create_engine_config( - UsageContext.OPENAI_API_SERVER) - uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config), - "uses_ray", False) - - build_engine = partial(AsyncLLMEngine.from_engine_args, - engine_args=engine_args, - engine_config=engine_config, - usage_context=UsageContext.OPENAI_API_SERVER) - if uses_ray: - # Must run in main thread with ray for its signal handlers to work - engine_client = build_engine() - else: - engine_client = await asyncio.get_running_loop().run_in_executor( - None, build_engine) - yield engine_client - if hasattr(engine_client, "shutdown"): - engine_client.shutdown() - return + engine_client: Optional[EngineClient] = None + try: + engine_client = AsyncLLMEngine.from_engine_args( + engine_args=engine_args, + usage_context=UsageContext.OPENAI_API_SERVER) + finally: + if engine_client and hasattr(engine_client, "shutdown"): + engine_client.shutdown() - # Otherwise, use the multiprocessing AsyncLLMEngine. + # MQLLMEngine. else: if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: # Make TemporaryDirectory for prometheus multiprocessing @@ -737,15 +725,6 @@ def signal_handler(*_) -> None: signal.signal(signal.SIGTERM, signal_handler) - # The child processes will send SIGQUIT to this process when - # any error happens. This process then clean up the whole tree. - # TODO(rob): move this into AsyncLLM.__init__ once we remove - # the context manager below. - def sigquit_handler(signum, frame): - kill_process_tree(os.getpid()) - - signal.signal(signal.SIGQUIT, sigquit_handler) - async with build_async_engine_client(args) as engine_client: app = build_app(args) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index da3da6dad6436..53666d3898f04 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,4 +1,5 @@ import asyncio +import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -23,6 +24,8 @@ logger = init_logger(__name__) +class AsyncEngineDeadError(RuntimeError): + pass class AsyncLLM(EngineClient): @@ -38,6 +41,17 @@ def __init__( log_requests: bool = True, start_engine_loop: bool = True, ) -> None: + + # The child processes will send SIGQUIT to this process when + # any error happens. We raise an error if this happens + # so that we can shutdown properly. + def sigquit_handler(signum, frame): + raise AsyncEngineDeadError( + "AsyncLLM got a SIGQUIT from background process. 
" + "see stack trace for root cause issue.") + + signal.signal(signal.SIGQUIT, sigquit_handler) + assert start_engine_loop self.log_requests = log_requests From 8f22b4be637dea120297f377a613e2d898d2ed4b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 28 Dec 2024 17:33:26 +0000 Subject: [PATCH 02/13] shutdown on error --- vllm/engine/protocol.py | 5 ----- vllm/entrypoints/openai/api_server.py | 3 ++- vllm/v1/engine/async_llm.py | 32 +++++++++++++++------------ 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index a066836b92708..b2a5cc17ead64 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -29,11 +29,6 @@ class EngineClient(ABC): def is_running(self) -> bool: ... - @property - @abstractmethod - def is_stopped(self) -> bool: - ... - @property @abstractmethod def errored(self) -> bool: diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1bcbdffaeac8b..bac72d87376da 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -68,7 +68,7 @@ from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, - is_valid_ipv6_address, kill_process_tree, set_ulimit) + is_valid_ipv6_address, set_ulimit) from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -142,6 +142,7 @@ async def build_async_engine_client_from_engine_args( engine_client = AsyncLLMEngine.from_engine_args( engine_args=engine_args, usage_context=UsageContext.OPENAI_API_SERVER) + yield engine_client finally: if engine_client and hasattr(engine_client, "shutdown"): engine_client.shutdown() diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 53666d3898f04..84793670f484d 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,4 +1,5 @@ import asyncio +import os import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union @@ -17,6 +18,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext +from vllm.utils import kill_process_tree from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor @@ -24,9 +26,11 @@ logger = init_logger(__name__) -class AsyncEngineDeadError(RuntimeError): + +class EngineDeadError(RuntimeError): pass + class AsyncLLM(EngineClient): def __init__( @@ -41,14 +45,18 @@ def __init__( log_requests: bool = True, start_engine_loop: bool = True, ) -> None: - - # The child processes will send SIGQUIT to this process when - # any error happens. We raise an error if this happens - # so that we can shutdown properly. + + # Flag for API server to know if we should shutdown. + self._errored = False + + # The child processes will send SIGQUIT when unrecoverable + # errors happen. We kill the process tree here so that the + # stack trace is very evident. def sigquit_handler(signum, frame): - raise AsyncEngineDeadError( - "AsyncLLM got a SIGQUIT from background process. " - "see stack trace for root cause issue.") + logger.fatal( + "AsyncLLM got SIGQUIT from worker processes, shutting " + "down. 
See stack trace above for root cause issue.") + kill_process_tree(os.getpid()) signal.signal(signal.SIGQUIT, sigquit_handler) @@ -255,7 +263,7 @@ async def generate( # If the request is disconnected by the client, the # generate() task will be canceled. So, we abort the # request if we end up here. - except asyncio.CancelledError: + except asyncio.exceptions.CancelledError: await self.abort(request_id) raise @@ -354,14 +362,10 @@ async def stop_profile(self) -> None: def is_running(self) -> bool: return True - @property - def is_stopped(self) -> bool: - return False - @property def errored(self) -> bool: return False @property def dead_error(self) -> BaseException: - return Exception() # TODO: implement + return Exception() From 577fe7f8a92726d02f8a17049de471df9001005b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 28 Dec 2024 17:34:44 +0000 Subject: [PATCH 03/13] remove EngineDeadError --- vllm/v1/engine/async_llm.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 84793670f484d..8a4a6f8a11841 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -26,11 +26,6 @@ logger = init_logger(__name__) - -class EngineDeadError(RuntimeError): - pass - - class AsyncLLM(EngineClient): def __init__( @@ -46,9 +41,6 @@ def __init__( start_engine_loop: bool = True, ) -> None: - # Flag for API server to know if we should shutdown. - self._errored = False - # The child processes will send SIGQUIT when unrecoverable # errors happen. We kill the process tree here so that the # stack trace is very evident. From 3ded2a6f1767e3429d74e61d36e390daf9f9c708 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 28 Dec 2024 17:36:06 +0000 Subject: [PATCH 04/13] remove unnessary changes --- vllm/engine/protocol.py | 5 +++++ vllm/v1/engine/async_llm.py | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index b2a5cc17ead64..a066836b92708 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -29,6 +29,11 @@ class EngineClient(ABC): def is_running(self) -> bool: ... + @property + @abstractmethod + def is_stopped(self) -> bool: + ... 
+ @property @abstractmethod def errored(self) -> bool: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 8a4a6f8a11841..4cc38f25a5c99 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -354,10 +354,14 @@ async def stop_profile(self) -> None: def is_running(self) -> bool: return True + @property + def is_stopped(self) -> bool: + return False + @property def errored(self) -> bool: return False @property def dead_error(self) -> BaseException: - return Exception() + return Exception() # TODO: implement From c4d878250b6c1e0007e4a209a449eb404a7164a1 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 28 Dec 2024 17:36:42 +0000 Subject: [PATCH 05/13] formatting --- vllm/v1/engine/async_llm.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 4cc38f25a5c99..059a4ce444114 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -26,6 +26,7 @@ logger = init_logger(__name__) + class AsyncLLM(EngineClient): def __init__( @@ -354,8 +355,8 @@ async def stop_profile(self) -> None: def is_running(self) -> bool: return True - @property - def is_stopped(self) -> bool: + @property + def is_stopped(self) -> bool: return False @property @@ -364,4 +365,4 @@ def errored(self) -> bool: @property def dead_error(self) -> BaseException: - return Exception() # TODO: implement + return Exception() # TODO: implement From 39b28b940c87e8827764df6779dd56d5d628585e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 28 Dec 2024 17:37:48 +0000 Subject: [PATCH 06/13] spurious change --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 059a4ce444114..bb1d22023ec14 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -256,7 +256,7 @@ async def generate( # If the request is disconnected by the client, the # generate() task will be canceled. So, we abort the # request if we end up here. - except asyncio.exceptions.CancelledError: + except asyncio.CancelledError: await self.abort(request_id) raise From 43cf6e77f81167ec4541c5cd6fdadec1660e94ad Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 28 Dec 2024 17:55:47 +0000 Subject: [PATCH 07/13] update --- vllm/v1/engine/async_llm.py | 38 ++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index bb1d22023ec14..d5c46be34d067 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,6 +1,7 @@ import asyncio import os import signal +import traceback from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -45,6 +46,10 @@ def __init__( # The child processes will send SIGQUIT when unrecoverable # errors happen. We kill the process tree here so that the # stack trace is very evident. + # TODO: rather than killing the main process, we should + # figure out how to raise an AsyncEngineDeadError and + # handle at the API server level so we can return a better + # error code to the clients calling VLLM. 
def sigquit_handler(signum, frame): logger.fatal( "AsyncLLM got SIGQUIT from worker processes, shutting " @@ -226,6 +231,7 @@ async def generate( if self.output_handler is None: self.output_handler = asyncio.create_task( self._run_output_handler()) + self.output_handler.add_done_callback(self._output_handler_cb) q = await self.add_request( request_id, @@ -275,23 +281,29 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" - try: - while True: - # 1) Pull EngineCoreOutput from the EngineCore. - outputs = await self.engine_core.get_output_async() + while True: + # 1) Pull EngineCoreOutput from the EngineCore. + outputs = await self.engine_core.get_output_async() - # 2) Detokenize based on the output. - request_outputs, reqs_to_abort = self.detokenizer.step(outputs) + # 2) Detokenize based on the output. + request_outputs, reqs_to_abort = self.detokenizer.step(outputs) - # 3) Put the RequestOutputs into the per-request queues. - self._process_request_outputs(request_outputs) + # 3) Put the RequestOutputs into the per-request queues. + self._process_request_outputs(request_outputs) - # 4) Abort any requests that finished due to stop strings. - await self.engine_core.abort_requests_async(reqs_to_abort) + # 4) Abort any requests that finished due to stop strings. + await self.engine_core.abort_requests_async(reqs_to_abort) + raise ValueError("Hello my name is") - except BaseException as e: - logger.error(e) - raise e + def _output_handler_cb(self, task: asyncio.Task): + try: + task.result() + except asyncio.CancelledError: + pass + except Exception: + logger.error("Exception in output handler: %s", + traceback.format_exc()) + kill_process_tree(os.getpid()) async def abort(self, request_id: str) -> None: """Abort RequestId in self, detokenizer, and engine core.""" From 5450350b28c6618a13c59a7a6ca603aa65613ea6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 28 Dec 2024 19:12:44 +0000 Subject: [PATCH 08/13] cleanup --- vllm/v1/engine/async_llm.py | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d5c46be34d067..4efab6bcc9dd8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,7 +1,6 @@ import asyncio import os import signal -import traceback from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -19,7 +18,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import kill_process_tree +from vllm.utils import get_exception_traceback, kill_process_tree from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor @@ -231,7 +230,6 @@ async def generate( if self.output_handler is None: self.output_handler = asyncio.create_task( self._run_output_handler()) - self.output_handler.add_done_callback(self._output_handler_cb) q = await self.add_request( request_id, @@ -281,28 +279,24 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" - while True: - # 1) Pull 
EngineCoreOutput from the EngineCore. - outputs = await self.engine_core.get_output_async() + try: + while True: + # 1) Pull EngineCoreOutput from the EngineCore. + outputs = await self.engine_core.get_output_async() - # 2) Detokenize based on the output. - request_outputs, reqs_to_abort = self.detokenizer.step(outputs) + # 2) Detokenize based on the output. + request_outputs, reqs_to_abort = self.detokenizer.step(outputs) - # 3) Put the RequestOutputs into the per-request queues. - self._process_request_outputs(request_outputs) + # 3) Put the RequestOutputs into the per-request queues. + self._process_request_outputs(request_outputs) - # 4) Abort any requests that finished due to stop strings. - await self.engine_core.abort_requests_async(reqs_to_abort) - raise ValueError("Hello my name is") + # 4) Abort any requests that finished due to stop strings. + await self.engine_core.abort_requests_async(reqs_to_abort) + raise ValueError("my error!") - def _output_handler_cb(self, task: asyncio.Task): - try: - task.result() - except asyncio.CancelledError: - pass except Exception: - logger.error("Exception in output handler: %s", - traceback.format_exc()) + traceback = get_exception_traceback() + logger.error("EngineCore hit an exception: %s", traceback) kill_process_tree(os.getpid()) async def abort(self, request_id: str) -> None: From 7c5b56448710e6894c59ac774201c537e97f633a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 28 Dec 2024 19:17:32 +0000 Subject: [PATCH 09/13] formatted --- vllm/v1/engine/async_llm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 4efab6bcc9dd8..44c06d9b8a023 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -292,7 +292,6 @@ async def _run_output_handler(self): # 4) Abort any requests that finished due to stop strings. 
await self.engine_core.abort_requests_async(reqs_to_abort) - raise ValueError("my error!") except Exception: traceback = get_exception_traceback() From acbe6e36c7343fdf24b9f72ea3b94805131ab01f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 29 Dec 2024 13:11:31 +0000 Subject: [PATCH 10/13] use logger.error directly --- vllm/engine/llm_engine.py | 1 + vllm/entrypoints/openai/serving_completion.py | 3 +++ vllm/transformers_utils/tokenizer_group/tokenizer_group.py | 2 ++ vllm/v1/engine/async_llm.py | 7 +++---- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1db3e59ff3bae..aa7de60725dce 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -616,6 +616,7 @@ def _add_processed_request( decoder_inputs = processed_inputs encoder_inputs = None + print(f"{decoder_inputs=}") seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id, lora_request, prompt_adapter_request) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index aaad7b8c7f44c..da9e16fa427cc 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -105,6 +105,7 @@ async def create_completion( tokenizer = await self.engine_client.get_tokenizer(lora_request) + print(f"{request.prompt=}") request_prompts, engine_prompts = await self._preprocess_completion( request, tokenizer, @@ -112,6 +113,8 @@ async def create_completion( truncate_prompt_tokens=request.truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) + print(f"{request_prompts=}") + print(f"{engine_prompts=}") except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py index 95a8f7098bbac..c7249efa6d52f 100644 --- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py @@ -57,7 +57,9 @@ def encode(self, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None) -> List[int]: tokenizer = self.get_lora_tokenizer(lora_request) + print(f"{prompt=}") ret = tokenizer.encode(prompt) + print(f"{ret=}") self._raise_if_input_too_long(ret, lora_request) return ret diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 44c06d9b8a023..39ea9aed829f9 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -18,7 +18,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import get_exception_traceback, kill_process_tree +from vllm.utils import kill_process_tree from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor @@ -293,9 +293,8 @@ async def _run_output_handler(self): # 4) Abort any requests that finished due to stop strings. 
await self.engine_core.abort_requests_async(reqs_to_abort) - except Exception: - traceback = get_exception_traceback() - logger.error("EngineCore hit an exception: %s", traceback) + except Exception as e: + logger.error(e) kill_process_tree(os.getpid()) async def abort(self, request_id: str) -> None: From 5087194c38e68d7e861d00893f010b8c11a239b8 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 29 Dec 2024 13:21:13 +0000 Subject: [PATCH 11/13] passing --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 39ea9aed829f9..bc1bd6faca8a9 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -294,7 +294,7 @@ async def _run_output_handler(self): await self.engine_core.abort_requests_async(reqs_to_abort) except Exception as e: - logger.error(e) + logger.exception("EngineCore output handler hit an error: %s", e) kill_process_tree(os.getpid()) async def abort(self, request_id: str) -> None: From 492084b1f7342c051ce3a0f83451099a75935bde Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 30 Dec 2024 13:53:11 +0000 Subject: [PATCH 12/13] fix --- vllm/engine/llm_engine.py | 1 - vllm/entrypoints/openai/serving_completion.py | 3 --- vllm/v1/engine/core_client.py | 16 ++++++---------- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index aa7de60725dce..1db3e59ff3bae 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -616,7 +616,6 @@ def _add_processed_request( decoder_inputs = processed_inputs encoder_inputs = None - print(f"{decoder_inputs=}") seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id, lora_request, prompt_adapter_request) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index da9e16fa427cc..aaad7b8c7f44c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -105,7 +105,6 @@ async def create_completion( tokenizer = await self.engine_client.get_tokenizer(lora_request) - print(f"{request.prompt=}") request_prompts, engine_prompts = await self._preprocess_completion( request, tokenizer, @@ -113,8 +112,6 @@ async def create_completion( truncate_prompt_tokens=request.truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - print(f"{request_prompts=}") - print(f"{engine_prompts=}") except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index beb5d57c20c83..3293205e110af 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -6,7 +6,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import get_open_zmq_ipc_path +from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) @@ -144,17 +144,13 @@ def __init__( else: self.ctx = zmq.Context() # type: ignore[attr-defined] - # Path for IPC. + # Paths and sockets for IPC. output_path = get_open_zmq_ipc_path() input_path = get_open_zmq_ipc_path() - - # Get output (EngineCoreOutput) from EngineCore. 
- self.output_socket = self.ctx.socket(zmq.constants.PULL) - self.output_socket.connect(output_path) - - # Send input (EngineCoreRequest) to EngineCore. - self.input_socket = self.ctx.socket(zmq.constants.PUSH) - self.input_socket.bind(input_path) + self.output_socket = make_zmq_socket(self.ctx, output_path, + zmq.constants.PULL) + self.input_socket = make_zmq_socket(self.ctx, input_path, + zmq.constants.PUSH) # Start EngineCore in background process. self.proc_handle: Optional[BackgroundProcHandle] From 0b229879d6a2600321d3cab7e825efeaff1adfba Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 30 Dec 2024 13:53:33 +0000 Subject: [PATCH 13/13] fix --- vllm/transformers_utils/tokenizer_group/tokenizer_group.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py index c7249efa6d52f..95a8f7098bbac 100644 --- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py @@ -57,9 +57,7 @@ def encode(self, request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None) -> List[int]: tokenizer = self.get_lora_tokenizer(lora_request) - print(f"{prompt=}") ret = tokenizer.encode(prompt) - print(f"{ret=}") self._raise_if_input_too_long(ret, lora_request) return ret
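
Note on the net effect of this series (an illustrative sketch, not part of the patches themselves): after patch 13, shutdown handling for the V1 AsyncLLM converges on three pieces — the AsyncLLM constructor installs a SIGQUIT handler that kills the whole process tree when a worker process reports an unrecoverable error, the background output-handler loop catches its own exceptions, logs them via logger.exception, and tears the tree down the same way, and the API server builds the engine client inside a try/finally so that shutdown() always runs. The sketch below restates that pattern in a self-contained form for readers not following the diffs line by line. It is an illustration, not vLLM code: MiniAsyncEngine, _kill_process_tree, and start() are made-up names, the loop body is stubbed out with asyncio.sleep, and POSIX-only signals are assumed; the real implementation uses vllm.utils.kill_process_tree and the AsyncLLM class shown in the diffs above.

    import asyncio
    import logging
    import os
    import signal
    from typing import Optional

    logger = logging.getLogger(__name__)


    def _kill_process_tree(pid: int) -> None:
        # Simplified stand-in for vllm.utils.kill_process_tree: kill the
        # whole process group so no orphaned worker processes linger.
        os.killpg(os.getpgid(pid), signal.SIGKILL)


    class MiniAsyncEngine:
        """Toy engine illustrating the shutdown pattern from this series."""

        def __init__(self) -> None:
            # Workers send SIGQUIT on unrecoverable errors; the parent kills
            # the whole tree so the failure is loud rather than a silent hang.
            def sigquit_handler(signum, frame):
                logger.fatal(
                    "Got SIGQUIT from worker processes, shutting down.")
                _kill_process_tree(os.getpid())

            signal.signal(signal.SIGQUIT, sigquit_handler)
            self.output_handler: Optional[asyncio.Task] = None

        async def _run_output_handler(self) -> None:
            try:
                while True:
                    # Stand-in for the real loop: pull EngineCoreOutputs,
                    # detokenize, push RequestOutputs to per-request queues.
                    await asyncio.sleep(1)
            except Exception as e:
                # As in patch 11: log with traceback, then kill the tree.
                logger.exception("Output handler hit an error: %s", e)
                _kill_process_tree(os.getpid())

        def start(self) -> None:
            # The real code starts this task lazily from generate().
            if self.output_handler is None:
                self.output_handler = asyncio.create_task(
                    self._run_output_handler())

The design choice worth noting is the one recorded in the TODO added in patch 07: rather than raising an AsyncEngineDeadError back up to the API server, the series as it stands kills the process tree from the parent, trading a graceful HTTP error response for an unmissable failure and guaranteed cleanup of the child processes.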