From c2ad07cf492a02020ba3fcf13186ab77007d3657 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 03:42:28 +0000 Subject: [PATCH 001/132] first rev of 3 process architecture --- examples/openai_completion_client.py | 8 +- vllm/v1/engine/__init__.py | 23 ++- vllm/v1/engine/async_llm.py | 52 ++++--- vllm/v1/engine/core.py | 78 ++++------ vllm/v1/engine/core_client.py | 47 ++---- vllm/v1/engine/detokenizer.py | 201 ++++++++++++++++++++++++- vllm/v1/executor/multiproc_executor.py | 6 +- vllm/v1/utils.py | 58 +++++-- 8 files changed, 343 insertions(+), 130 deletions(-) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index 58519f978d340..1f8b82bc5c9e9 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -2,7 +2,7 @@ # Modify OpenAI's API key and API base to use vLLM's API server. openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" +openai_api_base = "http://localhost:8001/v1" client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") @@ -14,14 +14,12 @@ model = models.data[0].id # Completion API -stream = False +stream = True completion = client.completions.create( model=model, prompt="A robot may not injure a human being", echo=False, - n=2, - stream=stream, - logprobs=3) + stream=stream) print("Completion results:") if stream: diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index cc0c7ea23469a..ee6b90b1bab1f 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,5 +1,6 @@ import enum from dataclasses import dataclass +from multiprocessing.process import BaseProcess from typing import List, Optional, Union import msgspec @@ -10,7 +11,18 @@ @dataclass -class DetokenizerRequest: +class BackgroundProcHandle: + proc: BaseProcess + ready_path: str + input_path: str + output_path: str + + +class DetokenizerRequest( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True, # type: ignore[call-arg] + gc=False): # type: ignore[call-arg] request_id: str prompt: Optional[str] @@ -75,6 +87,15 @@ class EngineCoreProfile: is_start: bool +class DetokenizerRequestType(enum.Enum): + """ + Request types defined as hex byte strings, so it can be sent over sockets + without separate encoding step. + """ + NEW = b'\x00' + OUT = b'\x01' + + class EngineCoreRequestType(enum.Enum): """ Request types defined as hex byte strings, so it can be sent over sockets diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b36de5f66917c..6b158ca5f667b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -16,9 +16,10 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext +from vllm.utils import get_open_zmq_ipc_path from vllm.v1.engine.async_stream import AsyncStream -from vllm.v1.engine.core_client import EngineCoreClient -from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.core_client import AsyncMPClient +from vllm.v1.engine.detokenizer import DetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor @@ -64,8 +65,13 @@ def __init__( vllm_config.lora_config, self.tokenizer, input_registry) + + # IPC path for EngineCore -> Detokenizer. + engine_core_outputs_path = get_open_zmq_ipc_path() + # Detokenizer (converts EngineCoreOutputs --> RequestOutput). 
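
The pieces added above (msgspec Structs plus the one-byte `DetokenizerRequestType` tags) describe a simple wire format: each ZMQ message is a multipart frame of [type byte, msgpack payload], so no separate envelope encoding is needed. A rough, self-contained sketch of that framing follows; `DemoRequest`, the inproc path, and the field names are illustrative, not part of this patch.

import msgspec
import zmq

class DemoRequest(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
    request_id: str
    prompt: str

NEW_REQUEST = b"\x00"   # one-byte request-type tag, like DetokenizerRequestType.NEW

ctx = zmq.Context()
push = ctx.socket(zmq.PUSH)
push.bind("inproc://demo")      # PUSH ends bind in this patch's convention
pull = ctx.socket(zmq.PULL)
pull.connect("inproc://demo")   # PULL ends connect

encoder = msgspec.msgpack.Encoder()
decoder = msgspec.msgpack.Decoder(DemoRequest)

# Send: [type tag, msgpack body]; receive: decode straight from the frame buffer.
push.send_multipart((NEW_REQUEST, encoder.encode(DemoRequest("req-0", "hi"))))
type_frame, data_frame = pull.recv_multipart(copy=False)
assert bytes(type_frame.buffer) == NEW_REQUEST
print(decoder.decode(data_frame.buffer))

ctx.destroy(linger=0)
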
- self.detokenizer = Detokenizer( + self.detokenizer = DetokenizerClient( + engine_core_outputs_path=engine_core_outputs_path, tokenizer_name=vllm_config.model_config.tokenizer, tokenizer_mode=vllm_config.model_config.tokenizer_mode, trust_remote_code=vllm_config.model_config.trust_remote_code, @@ -73,12 +79,11 @@ def __init__( ) # EngineCore (starts the engine in background process). - self.engine_core = EngineCoreClient.make_client( + self.engine_core = AsyncMPClient( + output_path=engine_core_outputs_path, vllm_config=vllm_config, executor_class=executor_class, usage_context=usage_context, - multiprocess_mode=True, - asyncio_mode=True, ) self.output_handler: Optional[asyncio.Task] = None @@ -121,6 +126,9 @@ def shutdown(self): if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() + + if detokenizer := getattr(self, "detokenizer", None): + detokenizer.shutdown() if handler := getattr(self, "output_handler", None): handler.cancel() @@ -152,8 +160,8 @@ async def add_request( ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: """Add new request to the AsyncLLM.""" - if self.detokenizer.is_request_active(request_id): - raise ValueError(f"Request {request_id} already exists.") + # if self.detokenizer.is_request_active(request_id): + # raise ValueError(f"Request {request_id} already exists.") # 1) Create a new AsyncStream for the request. stream = self._add_request_to_streams(request_id) @@ -163,10 +171,10 @@ async def add_request( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 3) Add the request to Detokenizer (this process). - self.detokenizer.add_request(detokenizer_req) + # 3) Add the DetokenizerRequest to Detokenizer. + await self.detokenizer.add_request_async(detokenizer_req) - # 4) Add the EngineCoreRequest to EngineCore (separate process). + # 4) Add the EngineCoreRequest to EngineCore. await self.engine_core.add_request_async(engine_core_req) # 5) Return the generator. @@ -296,29 +304,26 @@ async def _run_output_handler(self): try: while True: - # 1) Pull EngineCoreOutput from the EngineCore. - outputs = await self.engine_core.get_output_async() + # 1) Pull outputs from the Detokenizer. + request_outputs, reqs_to_abort = ( + await self.detokenizer.get_output_async()) - # 2) Detokenize based on the output. - request_outputs, reqs_to_abort = self.detokenizer.step(outputs) - - # 3) Put the RequestOutputs into the per-request AsyncStreams. + # 2) Put the RequestOutputs into the per-request AsyncStreams. self._process_request_outputs(request_outputs) - # 4) Abort any requests that finished due to stop strings. + # 3) Abort any requests that finished due to stop strings. await self.engine_core.abort_requests_async(reqs_to_abort) - # 5) Abort any requests due to client cancellations. + # 4) Abort any requests due to client cancellations. + # TODO: send back to detokenizer if this fails. await self._process_cancellations() except BaseException as e: logger.error(e) raise e - # TODO: can we eliminate these? - async def abort(self, request_id: str) -> None: - # Note: Who Calls this? I dont think this is actually used. + # Note: this is not used outside of testing. 
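
The `_run_output_handler` loop above is the fan-out point: one background task drains the Detokenizer and pushes each `RequestOutput` into the per-request `AsyncStream` that the corresponding `generate()` call is reading. A stripped-down model of that relationship, with an `asyncio.Queue` standing in for `AsyncStream` and tuples standing in for real outputs (all names here are illustrative):

import asyncio

async def main():
    streams = {"req-0": asyncio.Queue()}   # per-request streams
    detok_outputs = asyncio.Queue()        # stands in for the Detokenizer output pipe

    async def output_handler():
        # Background task: pull (request_id, text, finished) tuples and fan
        # them out to the matching per-request stream; drop finished requests.
        while True:
            request_id, text, finished = await detok_outputs.get()
            stream = streams.get(request_id)
            if stream is not None:
                await stream.put((text, finished))
            if finished:
                streams.pop(request_id, None)

    async def generate(request_id):
        # Each API call reads only from its own stream.
        stream = streams[request_id]
        while True:
            text, finished = await stream.get()
            print("yield:", text)
            if finished:
                return

    handler = asyncio.create_task(output_handler())
    consumer = asyncio.create_task(generate("req-0"))
    for text, finished in [("Hello", False), ("Hello world", True)]:
        await detok_outputs.put(("req-0", text, finished))
    await consumer
    handler.cancel()

asyncio.run(main())
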
raise ValueError("Not Supported on V1 yet.") def encode( @@ -345,8 +350,7 @@ async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, ) -> AnyTokenizer: - assert lora_request is None - return self.detokenizer.tokenizer + return self.tokenizer.get_lora_tokenizer(lora_request) async def is_tracing_enabled(self) -> bool: return False diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 56d4dc67e4a0e..dcafac4ad2463 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -3,8 +3,6 @@ import signal import threading import time -from dataclasses import dataclass -from multiprocessing.process import BaseProcess from typing import List, Tuple, Type import zmq @@ -17,15 +15,18 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext +from vllm.utils import get_open_zmq_ipc_path from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion) + EngineCoreRequestType, EngineCoreRequestUnion, + DetokenizerRequestType, + BackgroundProcHandle) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus from vllm.v1.serial_utils import PickleEncoder -from vllm.v1.utils import make_zmq_socket +from vllm.v1.utils import zmq_socket_ctx, wait_for_startup from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -134,14 +135,6 @@ def profile(self, is_start: bool = True): self.model_executor.profile(is_start) -@dataclass -class EngineCoreProcHandle: - proc: BaseProcess - ready_path: str - input_path: str - output_path: str - - class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" @@ -173,38 +166,11 @@ def __init__( daemon=True).start() # Send Readiness signal to EngineClient. - with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket: + logger.info("ABOUT TO SEND READINESS") + with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: + logger.info("SENDING READY SIGNAL") ready_socket.send_string(EngineCoreProc.READY_STR) - @staticmethod - def wait_for_startup( - proc: BaseProcess, - ready_path: str, - ) -> None: - """Wait until the EngineCore is ready.""" - - try: - sync_ctx = zmq.Context() # type: ignore[attr-defined] - socket = sync_ctx.socket(zmq.constants.PULL) - socket.connect(ready_path) - - # Wait for EngineCore to send EngineCoreProc.READY_STR. 
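
The startup wait removed here is generalized into `wait_for_startup` in vllm/v1/utils.py later in this patch, but the handshake itself stays the same: the child process PUSHes a READY string to a throwaway IPC path, and the parent polls the matching PULL socket with a timeout so it can also notice a dead child. A standalone sketch of that handshake; the path and names are illustrative.

import multiprocessing as mp
import os
import tempfile
import zmq

READY_STR = "READY"

def child(ready_path: str):
    # Child process: signal readiness once its setup is done.
    ctx = zmq.Context()
    sock = ctx.socket(zmq.PUSH)
    sock.bind(ready_path)
    sock.send_string(READY_STR)
    ctx.destroy(linger=0)

def wait_for_startup(proc, ready_path, timeout_ms=5000):
    ctx = zmq.Context()
    sock = ctx.socket(zmq.PULL)
    sock.connect(ready_path)
    try:
        # Poll instead of blocking so a crashed child is detected.
        while sock.poll(timeout=timeout_ms) == 0:
            if not proc.is_alive():
                raise RuntimeError("Background process failed to start.")
        assert sock.recv_string() == READY_STR
    finally:
        ctx.destroy(linger=0)

if __name__ == "__main__":
    ready_path = f"ipc://{tempfile.gettempdir()}/demo_ready_{os.getpid()}"
    proc = mp.get_context("spawn").Process(target=child, args=(ready_path,))
    proc.start()
    wait_for_startup(proc, ready_path)
    proc.join()
    print("child was ready")
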
- while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: - logger.debug("Waiting for EngineCoreProc to startup.") - - if not proc.is_alive(): - raise RuntimeError("EngineCoreProc failed to start.") - - message = socket.recv_string() - assert message == EngineCoreProc.READY_STR - - except BaseException as e: - logger.exception(e) - raise e - - finally: - sync_ctx.destroy(linger=0) - @staticmethod def make_engine_core_process( vllm_config: VllmConfig, @@ -212,9 +178,9 @@ def make_engine_core_process( usage_context: UsageContext, input_path: str, output_path: str, - ready_path: str, - ) -> EngineCoreProcHandle: + ) -> BackgroundProcHandle: context = get_mp_context() + ready_path = get_open_zmq_ipc_path() process_kwargs = { "input_path": input_path, @@ -228,10 +194,14 @@ def make_engine_core_process( proc = context.Process(target=EngineCoreProc.run_engine_core, kwargs=process_kwargs) proc.start() - - # Wait for startup - EngineCoreProc.wait_for_startup(proc, ready_path) - return EngineCoreProcHandle(proc=proc, + logger.info("WAITING FOR STARTUP") + wait_for_startup(proc=proc, + ready_path=ready_path, + ready_str=EngineCoreProc.READY_STR, + timeout_ms=POLLING_TIMEOUT_MS) + logger.info("READY") + + return BackgroundProcHandle(proc=proc, ready_path=ready_path, input_path=input_path, output_path=output_path) @@ -284,12 +254,13 @@ def run_busy_loop(self): if not self.scheduler.has_unfinished_requests(): while True: try: + logger.info("getting from input queue") req = self.input_queue.get(timeout=POLLING_TIMEOUT_S) self._handle_client_request(req) break except queue.Empty: self._log_stats() - logger.debug("EngineCore busy loop waiting.") + logger.info("EngineCore busy loop waiting.") except BaseException: raise @@ -302,6 +273,7 @@ def run_busy_loop(self): outputs = self.step() # 4) Put EngineCoreOutputs into the output queue. + logger.info("putting to output queue") self.output_queue.put_nowait(outputs) self._log_stats() @@ -339,7 +311,7 @@ def process_input_socket(self, input_path: str): decoder_add_req = PickleEncoder() decoder_abort_req = PickleEncoder() - with make_zmq_socket(input_path, zmq.constants.PULL) as socket: + with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket: while True: # (RequestType, RequestData) type_frame, data_frame = socket.recv_multipart(copy=False) @@ -367,9 +339,11 @@ def process_output_socket(self, output_path: str): # Reuse send buffer. 
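
The "reuse send buffer" idea above relies on msgspec's `encode_into`, which serializes into a caller-owned bytearray so the hot send loop avoids allocating a fresh bytes object per message. A tiny illustration; the `Payload` type is made up for the example.

import msgspec

class Payload(msgspec.Struct, array_like=True):
    request_id: str
    token_ids: list[int]

encoder = msgspec.msgpack.Encoder()
buffer = bytearray()   # allocated once, grown as needed, reused every iteration

for step in range(3):
    encoder.encode_into(Payload("req-0", list(range(step + 1))), buffer)
    # In the real loop, socket.send_multipart((buffer, ), copy=False) would go here.
    print(len(buffer), bytes(buffer)[:16])
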
buffer = bytearray() - with make_zmq_socket(output_path, zmq.constants.PUSH) as socket: + logger.info(f"{output_path=}") + with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) encoder.encode_into(outputs, buffer) - socket.send_multipart((buffer, ), copy=False) + msg = (DetokenizerRequestType.OUT.value, buffer) + socket.send_multipart(msg, copy=False) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index ff25a9b2e9cac..cfd3edab13877 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -7,13 +7,14 @@ import zmq.asyncio from vllm.logger import init_logger -from vllm.utils import get_open_zmq_ipc_path, kill_process_tree -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, +from vllm.utils import kill_process_tree, get_open_zmq_ipc_path +from vllm.v1.engine import (BackgroundProcHandle, + EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) -from vllm.v1.engine.core import (EngineCore, EngineCoreProc, - EngineCoreProcHandle) +from vllm.v1.engine.core import (EngineCore, EngineCoreProc) from vllm.v1.serial_utils import PickleEncoder +from vllm.v1.utils import make_zmq_socket logger = init_logger(__name__) @@ -129,6 +130,7 @@ class MPClient(EngineCoreClient): def __init__( self, *args, + output_path: str, asyncio_mode: bool, **kwargs, ): @@ -142,27 +144,19 @@ def __init__( else: self.ctx = zmq.Context() # type: ignore[attr-defined] - # Path for IPC. - ready_path = get_open_zmq_ipc_path() - output_path = get_open_zmq_ipc_path() input_path = get_open_zmq_ipc_path() - - # Get output (EngineCoreOutput) from EngineCore. - self.output_socket = self.ctx.socket(zmq.constants.PULL) - self.output_socket.connect(output_path) - - # Send input (EngineCoreRequest) to EngineCore. - self.input_socket = self.ctx.socket(zmq.constants.PUSH) - self.input_socket.bind(input_path) + self.input_socket = make_zmq_socket( + self.ctx, + input_path, + zmq.constants.PUSH, + ) # Start EngineCore in background process. 
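
`make_zmq_socket`, used just above and defined in vllm/v1/utils.py later in this patch, pins down one convention for these pipes: the PUSH end binds and the PULL end connects, so either side of a pipe can come up first and the producer's sends simply wait until the consumer attaches. A minimal usage sketch of that convention (single process, throwaway IPC path, illustrative message):

import os
import tempfile
import zmq

def make_zmq_socket(ctx, path, sock_type):
    # Convention from this patch: PUSH ends bind, PULL ends connect.
    sock = ctx.socket(sock_type)
    if sock_type == zmq.PULL:
        sock.connect(path)
    elif sock_type == zmq.PUSH:
        sock.bind(path)
    else:
        raise ValueError(f"Unknown socket type: {sock_type}")
    return sock

path = f"ipc://{tempfile.gettempdir()}/demo_pipe_{os.getpid()}"
ctx = zmq.Context()
pull = make_zmq_socket(ctx, path, zmq.PULL)   # consumer can attach first...
push = make_zmq_socket(ctx, path, zmq.PUSH)   # ...producer binds later
push.send(b"hello")
print(pull.recv())
ctx.destroy(linger=0)
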
- self.proc_handle: Optional[EngineCoreProcHandle] + self.proc_handle: Optional[BackgroundProcHandle] self.proc_handle = EngineCoreProc.make_engine_core_process( *args, - input_path= - input_path, # type: ignore[misc] # MyPy incorrectly flags duplicate keywords - output_path=output_path, # type: ignore[misc] - ready_path=ready_path, # type: ignore[misc] + input_path=input_path, + output_path=output_path, **kwargs, ) atexit.register(self.shutdown) @@ -207,12 +201,6 @@ class SyncMPClient(MPClient): def __init__(self, *args, **kwargs): super().__init__(*args, asyncio_mode=False, **kwargs) - def get_output(self) -> List[EngineCoreOutput]: - - (frame, ) = self.output_socket.recv_multipart(copy=False) - engine_core_outputs = self.decoder.decode(frame.buffer).outputs - return engine_core_outputs - def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: @@ -237,13 +225,6 @@ class AsyncMPClient(MPClient): def __init__(self, *args, **kwargs): super().__init__(*args, asyncio_mode=True, **kwargs) - async def get_output_async(self) -> List[EngineCoreOutput]: - - frames = await self.output_socket.recv_multipart(copy=False) - engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs - - return engine_core_outputs - async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 02f34e2b54dd5..a2f8c4a29b662 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,17 +1,29 @@ +import pickle +import zmq.asyncio +import msgspec +import signal from dataclasses import dataclass from typing import Dict, Iterable, List, Optional, Tuple, Union from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.executor.multiproc_worker_utils import get_mp_context from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import RequestOutputKind from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput +from vllm.utils import get_open_zmq_ipc_path +from vllm.v1.engine import (DetokenizerRequest, DetokenizerRequestType, + EngineCoreOutput, EngineCoreOutputs, + BackgroundProcHandle,) +from vllm.v1.utils import (make_zmq_socket, zmq_socket_ctx, + wait_for_startup) +from vllm.v1.serial_utils import PickleEncoder logger = init_logger(__name__) +POLLING_TIMEOUT_MS = 5000 @dataclass class IncrementalDetokenizer: @@ -270,3 +282,190 @@ def step( # Return to EngineClient. return request_outputs, requests_to_abort + +class DetokenizerProc(Detokenizer): + """ZMQ-wrapper for running Detokenizer in background process.""" + + READY_STR = "READY" + + def __init__( + self, + *args, + engine_core_outputs_path: str, + input_path: str, + output_path: str, + ready_path: str, + **kwargs + ): + super().__init__(*args, **kwargs) + + self.engine_core_outputs_path = engine_core_outputs_path + self.input_path = input_path + self.output_path = output_path + + # Send readiness signal. 
+ with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: + ready_socket.send_string(DetokenizerProc.READY_STR) + + + @staticmethod + def make_detokenizer_process( + engine_core_outputs_path: str, + input_path: str, + output_path: str, + tokenizer_name: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + revision: Optional[str] = None, + ) -> BackgroundProcHandle: + context = get_mp_context() + ready_path = get_open_zmq_ipc_path() + + process_kwargs = { + "engine_core_outputs_path": engine_core_outputs_path, + "input_path": input_path, + "output_path": output_path, + "ready_path": ready_path, + "tokenizer_name": tokenizer_name, + "tokenizer_mode": tokenizer_mode, + "trust_remote_code": trust_remote_code, + "revision": revision, + } + # Run Detokenizer busy loop in background process. + proc = context.Process(target=DetokenizerProc.run_detokenizer, + kwargs=process_kwargs) + proc.start() + wait_for_startup(proc=proc, + ready_path=ready_path, + ready_str=DetokenizerProc.READY_STR, + timeout_ms=POLLING_TIMEOUT_MS) + + return BackgroundProcHandle(proc=proc, + ready_path=ready_path, + input_path=input_path, + output_path=output_path) + + @staticmethod + def run_detokenizer(*args, **kwargs): + """Launch Detokenizer busy loop in background process.""" + + # Signal handler used for graceful termination. + # SystemExit exception is only raised once to allow this and worker + # processes to terminate without error + shutdown_requested = False + + def signal_handler(signum, frame): + nonlocal shutdown_requested + if not shutdown_requested: + shutdown_requested = True + raise SystemExit() + + # Either SIGTERM or SIGINT will terminate the engine_core + signal.signal(signal.SIGTERM, signal_handler) + signal.signal(signal.SIGINT, signal_handler) + + detokenizer = None + try: + detokenizer = DetokenizerProc(*args, **kwargs) + detokenizer.run_busy_loop() + + except SystemExit: + logger.debug("Detokenizer interrupted.") + + except BaseException as e: + logger.exception(e) + raise e + + finally: + if detokenizer is not None: + detokenizer = None + + def run_busy_loop(self): + """Core busy loop of the Detokenizer.""" + + decoder_new = msgspec.msgpack.Decoder(DetokenizerRequest) + decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) + + with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.constants.PULL) as engine_core_outputs_socket, + zmq_socket_ctx(self.input_path, zmq.constants.PULL) as input_socket, + zmq_socket_ctx(self.output_path, zmq.constants.PUSH) as output_socket): + + # TODO: make this work without poll by having both EngineCore + # and AsyncLLM send to the same socket (unclear why this was not working + # when I originally tried it) + poller = zmq.Poller() + poller.register(engine_core_outputs_socket, zmq.POLLIN) + poller.register(input_socket, zmq.POLLIN) + + while True: + socks = dict(poller.poll()) + + # Handle NewRequest + if input_socket in socks: + (frame, ) = input_socket.recv_multipart(copy=False) + detokenizer_request = decoder_new.decode(frame.buffer) + self.add_request(detokenizer_request) + + # Handle EngineCoreOutput + if from_engine_core_socket in socks: + (frame, ) = from_engine_core_socket.recv_multipart(copy=False) + engine_core_outputs = decoder_out.decode(frame.buffer).outputs + outputs = self.step(engine_core_outputs) + msg = pickle.dumps(outputs, protocol=pickle.HIGHEST_PROTOCOL) + output_socket.send_multipart((msg, ), copy=False) + + # TODO: handle aborted due to client cancellation + # TODO: pickle -> msgpack + # TODO: send stop 
string aborts back to EngineCore directly + + +class DetokenizerClient: + + def __init__(self, *args, engine_core_outputs_path: str, **kwargs): + + # Serialization setup. + self.encoder = msgspec.msgpack.Encoder() + self.decoder = PickleEncoder() + + # ZMQ setup. + self.ctx = zmq.asyncio.Context() + + # Get input (DetokenizerRequest) to Detokenizer. + input_path = get_open_zmq_ipc_path() + self.input_socket = make_zmq_socket( + self.ctx, + input_path, + zmq.constants.PUSH, + ) + + # Get output (RequestOutput) from Detokenizer. + output_path = get_open_zmq_ipc_path() + self.output_socket = make_zmq_socket( + self.ctx, + output_path, + zmq.constants.PULL, + ) + self.output_socket.connect(output_path) + + # Start Detokenizer in background process. + self.proc_handle: Optional[BackgroundProcHandle] + self.proc_handle = DetokenizerProc.make_detokenizer_process( + *args, + engine_core_outputs_path=engine_core_outputs_path, + input_path=input_path, + output_path=output_path, + **kwargs, + ) + + async def add_request_async(self, request: DetokenizerRequest): + """Send new DetokenizerRequest to Detokenizer.""" + + msg = (DetokenizerRequestType.NEW.value, self.encoder.encode(request)) + await self.input_socket.send_multipart(msg, copy=False) + + + async def get_output_async(self) -> Tuple[List[RequestOutput], List[str]]: + """Get RequestOutputs, RequestsToAbort from Detokenizer.""" + + (frame, ) = await self.output_socket.recv_multipart(copy=False) + return self.decoder.decode(frame.buffer) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 17441dacdc5cf..aa246f778f8f3 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -23,7 +23,7 @@ get_open_zmq_ipc_path) from vllm.v1.executor.abstract import Executor from vllm.v1.outputs import ModelRunnerOutput -from vllm.v1.utils import make_zmq_socket +from vllm.v1.utils import zmq_socket_ctx from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -254,7 +254,7 @@ def __init__( worker_response_mq_handle = self.worker_response_mq.export_handle() # Send Readiness signal to EngineCore process. - with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket: + with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: payload = pickle.dumps(worker_response_mq_handle, protocol=pickle.HIGHEST_PROTOCOL) ready_socket.send_string(WorkerProc.READY_STR) @@ -356,7 +356,7 @@ def wait_for_startup( ready_path: str, ) -> Optional[Handle]: """Wait until the Worker is ready.""" - with make_zmq_socket(ready_path, zmq.constants.PULL) as socket: + with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: # Wait for Worker to send READY. 
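
The `DetokenizerClient` above talks to its background process through zmq.asyncio sockets: `add_request_async` pushes a tagged frame, and `get_output_async` awaits a multipart reply and decodes `frame.buffer`. A self-contained sketch of that send/await pattern, collapsed into one process over inproc with dummy payloads (everything named here is illustrative):

import asyncio
import zmq
import zmq.asyncio

async def main():
    ctx = zmq.asyncio.Context()
    push = ctx.socket(zmq.PUSH)
    push.bind("inproc://demo_pipe")      # PUSH binds, per this patch's convention
    pull = ctx.socket(zmq.PULL)
    pull.connect("inproc://demo_pipe")

    # Sender side: one-byte type tag plus payload, like add_request_async.
    await push.send_multipart((b"\x00", b"request-payload"), copy=False)

    # Receiver side: await the frames without copying, then read frame.buffer.
    type_frame, data_frame = await pull.recv_multipart(copy=False)
    print(bytes(type_frame.buffer), bytes(data_frame.buffer))

    ctx.destroy(linger=0)

asyncio.run(main())
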
while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 5f327d7066830..ecd98f246b064 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,3 +1,5 @@ +from multiprocessing.process import BaseProcess + from collections import OrderedDict from collections.abc import Sequence from contextlib import contextmanager @@ -5,6 +7,7 @@ overload) import zmq +import zmq.asyncio from vllm.logger import init_logger @@ -78,24 +81,33 @@ def __len__(self): return len(self._x) -@contextmanager def make_zmq_socket( + ctx: Union[zmq.asyncio.Context, zmq.Context], + path: str, + type: Any + ) -> Union[zmq.Socket, zmq.asyncio.Socket]: + """Make a ZMQ socket with the proper bind/connext semantics.""" + + socket = ctx.socket(type) + + if type == zmq.constants.PULL: + socket.connect(path) + elif type == zmq.constants.PUSH: + socket.bind(path) + else: + raise ValueError(f"Unknown Socket Type: {type}") + + return socket + +@contextmanager +def zmq_socket_ctx( path: str, type: Any) -> Iterator[zmq.Socket]: # type: ignore[name-defined] """Context manager for a ZMQ socket""" ctx = zmq.Context() # type: ignore[attr-defined] try: - socket = ctx.socket(type) - - if type == zmq.constants.PULL: - socket.connect(path) - elif type == zmq.constants.PUSH: - socket.bind(path) - else: - raise ValueError(f"Unknown Socket Type: {type}") - - yield socket + yield make_zmq_socket(ctx, path, type) except KeyboardInterrupt: logger.debug("Worker had Keyboard Interrupt.") @@ -104,6 +116,30 @@ def make_zmq_socket( ctx.destroy(linger=0) +def wait_for_startup( + proc: BaseProcess, + ready_path: str, + ready_str: str, + timeout_ms: int, +) -> None: + """Wait until a background process is ready.""" + + with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: + try: + while socket.poll(timeout=timeout_ms) == 0: + logger.debug("Waiting for background proc to startup.") + + if not proc.is_alive(): + raise RuntimeError("Background process failed to start.") + + message = socket.recv_string() + assert message == ready_str + + except BaseException as e: + logger.exception(e) + raise e + + K = TypeVar('K') V = TypeVar('V') From f0b3e36e48c121b5139e36ec871e1631bab13afd Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 03:45:15 +0000 Subject: [PATCH 002/132] finally able to generate text --- vllm/v1/engine/core.py | 6 ++---- vllm/v1/engine/detokenizer.py | 6 +++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index dcafac4ad2463..f8b49abd5b385 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -194,12 +194,10 @@ def make_engine_core_process( proc = context.Process(target=EngineCoreProc.run_engine_core, kwargs=process_kwargs) proc.start() - logger.info("WAITING FOR STARTUP") wait_for_startup(proc=proc, ready_path=ready_path, ready_str=EngineCoreProc.READY_STR, timeout_ms=POLLING_TIMEOUT_MS) - logger.info("READY") return BackgroundProcHandle(proc=proc, ready_path=ready_path, @@ -273,7 +271,6 @@ def run_busy_loop(self): outputs = self.step() # 4) Put EngineCoreOutputs into the output queue. 
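
The `EngineCoreProc` shape referenced here decouples socket IO from the model loop: a reader thread feeds `input_queue`, the busy loop consumes it and pushes results onto `output_queue`, and a writer thread drains that back to a socket. A minimal thread-and-queue skeleton of that structure, with ZMQ and the model step replaced by prints (the work and sentinel values are illustrative):

import queue
import threading

input_queue = queue.Queue()
output_queue = queue.Queue()

def process_input():
    # Stands in for process_input_socket: socket recv -> input_queue.
    for req in ["req-0", "req-1", None]:
        input_queue.put(req)

def process_output():
    # Stands in for process_output_socket: output_queue -> socket send.
    while True:
        out = output_queue.get()
        if out is None:
            break
        print("sent:", out)

threading.Thread(target=process_input, daemon=True).start()
writer = threading.Thread(target=process_output, daemon=True)
writer.start()

# Busy loop: block on new work when idle, otherwise step and emit outputs.
while True:
    req = input_queue.get(timeout=5)
    if req is None:
        output_queue.put(None)
        break
    output_queue.put(f"output for {req}")

writer.join()
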
- logger.info("putting to output queue") self.output_queue.put_nowait(outputs) self._log_stats() @@ -345,5 +342,6 @@ def process_output_socket(self, output_path: str): engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) encoder.encode_into(outputs, buffer) - msg = (DetokenizerRequestType.OUT.value, buffer) + # msg = (DetokenizerRequestType.OUT.value, buffer) + msg = (buffer, ) socket.send_multipart(msg, copy=False) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index a2f8c4a29b662..5dff221166314 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -407,8 +407,8 @@ def run_busy_loop(self): self.add_request(detokenizer_request) # Handle EngineCoreOutput - if from_engine_core_socket in socks: - (frame, ) = from_engine_core_socket.recv_multipart(copy=False) + if engine_core_outputs_socket in socks: + (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs outputs = self.step(engine_core_outputs) msg = pickle.dumps(outputs, protocol=pickle.HIGHEST_PROTOCOL) @@ -460,7 +460,7 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): async def add_request_async(self, request: DetokenizerRequest): """Send new DetokenizerRequest to Detokenizer.""" - msg = (DetokenizerRequestType.NEW.value, self.encoder.encode(request)) + msg = (self.encoder.encode(request), ) await self.input_socket.send_multipart(msg, copy=False) From ce8aa2c35cad19e56f3899d8df2627750b600cf3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 04:16:51 +0000 Subject: [PATCH 003/132] breaking under load --- vllm/v1/engine/async_llm.py | 11 +++++- vllm/v1/engine/core.py | 3 +- vllm/v1/engine/detokenizer.py | 74 ++++++++++++++++++----------------- 3 files changed, 49 insertions(+), 39 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6b158ca5f667b..d007712b6359e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -286,7 +286,9 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): for request_output in request_outputs: request_id = request_output.request_id - assert request_id in self.request_streams + logger.debug("about to assert") + if request_id not in self.request_streams: + raise ValueError("%s not found in Request Steams", request_id) # Each request in the API server pulls from the per-request stream. stream = self.request_streams.get(request_id) @@ -305,20 +307,24 @@ async def _run_output_handler(self): try: while True: # 1) Pull outputs from the Detokenizer. + logger.debug("get_output_async") request_outputs, reqs_to_abort = ( await self.detokenizer.get_output_async()) # 2) Put the RequestOutputs into the per-request AsyncStreams. + logger.debug("_process_request_outputs") self._process_request_outputs(request_outputs) # 3) Abort any requests that finished due to stop strings. + logger.debug("abort_requests_async") await self.engine_core.abort_requests_async(reqs_to_abort) # 4) Abort any requests due to client cancellations. # TODO: send back to detokenizer if this fails. 
+ logger.debug("process_cancellations") await self._process_cancellations() - except BaseException as e: + except Exception as e: logger.error(e) raise e @@ -350,6 +356,7 @@ async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, ) -> AnyTokenizer: + logger.debug("Called get_tokenizer.") return self.tokenizer.get_lora_tokenizer(lora_request) async def is_tracing_enabled(self) -> bool: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f8b49abd5b385..8af753f4f39ab 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -33,7 +33,7 @@ POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -LOGGING_TIME_S = 5000 +LOGGING_TIME_S = 5 class EngineCore: @@ -252,7 +252,6 @@ def run_busy_loop(self): if not self.scheduler.has_unfinished_requests(): while True: try: - logger.info("getting from input queue") req = self.input_queue.get(timeout=POLLING_TIMEOUT_S) self._handle_client_request(req) break diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 5dff221166314..1caa656870ddc 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -383,41 +383,45 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the Detokenizer.""" - decoder_new = msgspec.msgpack.Decoder(DetokenizerRequest) - decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) - - with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.constants.PULL) as engine_core_outputs_socket, - zmq_socket_ctx(self.input_path, zmq.constants.PULL) as input_socket, - zmq_socket_ctx(self.output_path, zmq.constants.PUSH) as output_socket): - - # TODO: make this work without poll by having both EngineCore - # and AsyncLLM send to the same socket (unclear why this was not working - # when I originally tried it) - poller = zmq.Poller() - poller.register(engine_core_outputs_socket, zmq.POLLIN) - poller.register(input_socket, zmq.POLLIN) - - while True: - socks = dict(poller.poll()) - - # Handle NewRequest - if input_socket in socks: - (frame, ) = input_socket.recv_multipart(copy=False) - detokenizer_request = decoder_new.decode(frame.buffer) - self.add_request(detokenizer_request) - - # Handle EngineCoreOutput - if engine_core_outputs_socket in socks: - (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) - engine_core_outputs = decoder_out.decode(frame.buffer).outputs - outputs = self.step(engine_core_outputs) - msg = pickle.dumps(outputs, protocol=pickle.HIGHEST_PROTOCOL) - output_socket.send_multipart((msg, ), copy=False) - - # TODO: handle aborted due to client cancellation - # TODO: pickle -> msgpack - # TODO: send stop string aborts back to EngineCore directly - + try: + # TODO: handle aborted due to client cancellation + # TODO: pickle -> msgpack + # TODO: send stop string aborts back to EngineCore directly + + decoder_new = msgspec.msgpack.Decoder(DetokenizerRequest) + decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) + + with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.constants.PULL) as engine_core_outputs_socket, + zmq_socket_ctx(self.input_path, zmq.constants.PULL) as input_socket, + zmq_socket_ctx(self.output_path, zmq.constants.PUSH) as output_socket): + + # TODO: make this work without poll by having both EngineCore + # and AsyncLLM send to the same socket (unclear why this was not working + # when I originally tried it) + poller = zmq.Poller() + poller.register(engine_core_outputs_socket, zmq.POLLIN) + poller.register(input_socket, zmq.POLLIN) + + while True: + socks = 
dict(poller.poll()) + + # Handle NewRequest + if input_socket in socks: + (frame, ) = input_socket.recv_multipart(copy=False) + detokenizer_request = decoder_new.decode(frame.buffer) + self.add_request(detokenizer_request) + + # Handle EngineCoreOutput + if engine_core_outputs_socket in socks: + (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) + engine_core_outputs = decoder_out.decode(frame.buffer).outputs + outputs = self.step(engine_core_outputs) + msg = pickle.dumps(outputs, protocol=pickle.HIGHEST_PROTOCOL) + output_socket.send_multipart((msg, ), copy=False) + + except Exception as e: + logger.error(e) + raise e class DetokenizerClient: From 457d6184b8521a43d139abe3e0a3e22d4b956dc5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 15:42:29 +0000 Subject: [PATCH 004/132] working e2e --- benchmarks/backend_request_func.py | 4 +++- vllm/v1/core/scheduler.py | 2 +- vllm/v1/engine/async_llm.py | 8 +------- vllm/v1/engine/core.py | 5 +---- vllm/v1/engine/detokenizer.py | 14 ++++++++------ 5 files changed, 14 insertions(+), 19 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index b67849038cf0d..1374768dc3def 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -242,7 +242,9 @@ async def async_request_openai_completions( "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, - "ignore_eos": request_func_input.ignore_eos, + # "ignore_eos": request_func_input.ignore_eos, + "ignore_eos": False, + } if request_func_input.extra_body: payload.update(request_func_input.extra_body) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index f76364f64033d..b44d72afae94a 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -424,7 +424,7 @@ def update_from_output( # Check for stop and update request state. # This must be called before me make the EngineCoreOutput. stopped = self._check_stop(request) - + # Add EngineCoreOutput for this Request. output = EngineCoreOutput( request_id=req_id, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d007712b6359e..f6a52213be965 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -286,9 +286,8 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): for request_output in request_outputs: request_id = request_output.request_id - logger.debug("about to assert") if request_id not in self.request_streams: - raise ValueError("%s not found in Request Steams", request_id) + raise ValueError(f"{request_id} not in AsyncStreams") # Each request in the API server pulls from the per-request stream. stream = self.request_streams.get(request_id) @@ -307,21 +306,17 @@ async def _run_output_handler(self): try: while True: # 1) Pull outputs from the Detokenizer. - logger.debug("get_output_async") request_outputs, reqs_to_abort = ( await self.detokenizer.get_output_async()) # 2) Put the RequestOutputs into the per-request AsyncStreams. - logger.debug("_process_request_outputs") self._process_request_outputs(request_outputs) # 3) Abort any requests that finished due to stop strings. - logger.debug("abort_requests_async") await self.engine_core.abort_requests_async(reqs_to_abort) # 4) Abort any requests due to client cancellations. # TODO: send back to detokenizer if this fails. 
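
`DetokenizerProc.run_busy_loop` above multiplexes two inbound pipes, new requests from the AsyncLLM and outputs from the EngineCore, with a `zmq.Poller`. A compact standalone version of that multiplexing, collapsed into one process over inproc with fake payloads (paths and messages are illustrative):

import zmq

ctx = zmq.Context()

def pipe(path):
    push = ctx.socket(zmq.PUSH)
    push.bind(path)
    pull = ctx.socket(zmq.PULL)
    pull.connect(path)
    return push, pull

new_req_push, new_req_pull = pipe("inproc://new_requests")
outputs_push, outputs_pull = pipe("inproc://engine_outputs")

poller = zmq.Poller()
poller.register(new_req_pull, zmq.POLLIN)
poller.register(outputs_pull, zmq.POLLIN)

new_req_push.send(b"new request")
outputs_push.send(b"engine core output")

handled = 0
while handled < 2:
    socks = dict(poller.poll())
    if new_req_pull in socks:
        print("add_request:", new_req_pull.recv())
        handled += 1
    if outputs_pull in socks:
        print("step on:", outputs_pull.recv())
        handled += 1

ctx.destroy(linger=0)
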
- logger.debug("process_cancellations") await self._process_cancellations() except Exception as e: @@ -356,7 +351,6 @@ async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, ) -> AnyTokenizer: - logger.debug("Called get_tokenizer.") return self.tokenizer.get_lora_tokenizer(lora_request) async def is_tracing_enabled(self) -> bool: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 8af753f4f39ab..30422891413c6 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -166,9 +166,7 @@ def __init__( daemon=True).start() # Send Readiness signal to EngineClient. - logger.info("ABOUT TO SEND READINESS") with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: - logger.info("SENDING READY SIGNAL") ready_socket.send_string(EngineCoreProc.READY_STR) @staticmethod @@ -257,7 +255,7 @@ def run_busy_loop(self): break except queue.Empty: self._log_stats() - logger.info("EngineCore busy loop waiting.") + logger.debug("EngineCore busy loop waiting.") except BaseException: raise @@ -335,7 +333,6 @@ def process_output_socket(self, output_path: str): # Reuse send buffer. buffer = bytearray() - logger.info(f"{output_path=}") with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: engine_core_outputs = self.output_queue.get() diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 1caa656870ddc..a1c384cd45967 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -277,6 +277,9 @@ def step( # Free completed requests. if request_output.finished: self.request_states.pop(request_id) + # If Request finished but EngineCore not finished, + # this was caused by a stop string + we need to send + # an abort signal to the EngineCore. if not engine_core_output.finished: requests_to_abort.append(request_id) @@ -395,9 +398,9 @@ def run_busy_loop(self): zmq_socket_ctx(self.input_path, zmq.constants.PULL) as input_socket, zmq_socket_ctx(self.output_path, zmq.constants.PUSH) as output_socket): - # TODO: make this work without poll by having both EngineCore - # and AsyncLLM send to the same socket (unclear why this was not working - # when I originally tried it) + # TODO: avoid poll by having both EngineCore + # and AsyncLLM send to the same socket (unclear why this + # was not working when I originally tried it) poller = zmq.Poller() poller.register(engine_core_outputs_socket, zmq.POLLIN) poller.register(input_socket, zmq.POLLIN) @@ -405,13 +408,13 @@ def run_busy_loop(self): while True: socks = dict(poller.poll()) - # Handle NewRequest + # Handle NewRequest. if input_socket in socks: (frame, ) = input_socket.recv_multipart(copy=False) detokenizer_request = decoder_new.decode(frame.buffer) self.add_request(detokenizer_request) - # Handle EngineCoreOutput + # Handle EngineCoreOutput. if engine_core_outputs_socket in socks: (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs @@ -449,7 +452,6 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): output_path, zmq.constants.PULL, ) - self.output_socket.connect(output_path) # Start Detokenizer in background process. 
self.proc_handle: Optional[BackgroundProcHandle] From c980dbd50574b36edbd95cafb822652db09274e0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 16:34:11 +0000 Subject: [PATCH 005/132] workign e2e --- examples/openai_completion_client.py | 8 +- vllm/entrypoints/openai/serving_completion.py | 10 +- vllm/v1/engine/async_llm.py | 198 ++++++++++-------- vllm/v1/engine/core.py | 2 +- 4 files changed, 124 insertions(+), 94 deletions(-) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index 58519f978d340..1f8b82bc5c9e9 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -2,7 +2,7 @@ # Modify OpenAI's API key and API base to use vLLM's API server. openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" +openai_api_base = "http://localhost:8001/v1" client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") @@ -14,14 +14,12 @@ model = models.data[0].id # Completion API -stream = False +stream = True completion = client.completions.create( model=model, prompt="A robot may not injure a human being", echo=False, - n=2, - stream=stream, - logprobs=3) + stream=stream) print("Completion results:") if stream: diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index bd39a4c42e938..d87c410c0124c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -159,8 +159,10 @@ async def create_completion( # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - result_generator = merge_async_iterators( - *generators, is_cancelled=raw_request.is_disconnected) + # result_generator = merge_async_iterators( + # *generators, is_cancelled=raw_request.is_disconnected) + assert len(generators) == 1 + result_generator = generators[0] model_name = self._get_model_name(lora_request) num_prompts = len(engine_prompts) @@ -256,7 +258,9 @@ async def completion_stream_generator( include_usage, include_continuous_usage = False, False try: - async for prompt_idx, res in result_generator: + # async for prompt_idx, res in result_generator: + async for res in result_generator: + prompt_idx = 0 prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b36de5f66917c..0c5eda420b102 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -54,10 +54,11 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() - # Request streams (map of request_id -> AsyncStream). - self.request_streams: Dict[str, AsyncStream] = {} - # List of cancelled request ids to be aborted. - self.client_aborted_requests: List[str] = [] + # # Request streams (map of request_id -> AsyncStream). + # self.request_streams: Dict[str, AsyncStream] = {} + # # List of cancelled request ids to be aborted. + # self.client_aborted_requests: List[str] = [] + self.rid_to_state = {} # Processor (converts Inputs --> EngineCoreRequests). 
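
The `rid_to_state` dict introduced above (formalized as a `RequestState` dataclass in a later commit) replaces the per-request AsyncStreams: the output handler appends to `out_list` and sets an `asyncio.Event`, while `generate()` waits on that event with a timeout so it can periodically check for things like client disconnects instead of blocking forever. A standalone sketch of that handshake; the one-second timeout and the dummy outputs are illustrative.

import asyncio
from dataclasses import dataclass, field

@dataclass
class RequestState:
    event: asyncio.Event = field(default_factory=asyncio.Event)
    out_list: list = field(default_factory=list)
    finished: bool = False

async def main():
    state = RequestState()

    async def output_handler():
        # Stands in for the loop that drains the Detokenizer.
        for text, finished in [("Hello", False), ("Hello world", True)]:
            await asyncio.sleep(0.1)
            state.out_list.append(text)
            state.finished = finished
            state.event.set()

    handler = asyncio.create_task(output_handler())

    # generate(): wait for new output, waking up periodically on timeout.
    while True:
        try:
            await asyncio.wait_for(state.event.wait(), timeout=1)
        except asyncio.TimeoutError:
            continue   # the real code would check for cancellation here
        out = state.out_list[-1]
        state.out_list.clear()
        state.event.clear()
        print("yield:", out)
        if state.finished:
            break

    await handler

asyncio.run(main())
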
self.processor = Processor(vllm_config.model_config, @@ -149,14 +150,18 @@ async def add_request( trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: + ) -> asyncio.Event: """Add new request to the AsyncLLM.""" if self.detokenizer.is_request_active(request_id): raise ValueError(f"Request {request_id} already exists.") - # 1) Create a new AsyncStream for the request. - stream = self._add_request_to_streams(request_id) + state = { + "out_list": [], + "event": asyncio.Event(), + "finished": False, + } + self.rid_to_state[request_id] = state # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. detokenizer_req, engine_core_req = self.processor.process_inputs( @@ -169,8 +174,7 @@ async def add_request( # 4) Add the EngineCoreRequest to EngineCore (separate process). await self.engine_core.add_request_async(engine_core_req) - # 5) Return the generator. - return stream.generator() + return state # TODO: we should support multiple prompts in one call, as you # can do with LLM.generate. So that for multi-prompt completion @@ -209,87 +213,113 @@ async def generate( self.output_handler = asyncio.create_task( self._run_output_handler()) - async for output in await self.add_request( - request_id, - prompt, - sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, - priority=priority, - ): - yield output - - def _finish_stream(self, request_id: str): - stream = self.request_streams.pop(request_id, None) - if stream is not None: - stream.finish() - - def _add_request_to_streams( - self, - request_id: str, - ) -> AsyncStream: - - if request_id in self.request_streams: - raise ValueError(f"Request id {request_id} already running.") - - # Avoid streams having circular ref to parent AsyncLLM object. - aborted_reqs = self.client_aborted_requests - stream = AsyncStream(request_id, aborted_reqs.append) - self.request_streams[request_id] = stream - - if self.log_requests: - logger.info("Added request %s.", request_id) - - return stream - - async def _process_cancellations(self) -> None: - """ - Process requests cancelled from user disconnecting. - - When a client disconnects, AsyncStream._cancel() is called. - We passed a callback to AsyncStream(), which appends to - self.client_aborted_requests. - - As a result, if any requests are canceled from the user side - the request_id will show up in self.client_aborted_requests. - """ - - # Avoid streams having circular ref to parent AsyncLLM object. - if not self.client_aborted_requests: - return - reqs_to_abort = self.client_aborted_requests.copy() - self.client_aborted_requests.clear() - - # Remove from Detokenizer. - self.detokenizer.abort_requests(reqs_to_abort) - - # Remove from RequestStreams. - for request_id in reqs_to_abort: - if self.log_requests: - logger.info("User-cancelled request %s.", request_id) - self._finish_stream(request_id) - - # Remove from EngineCore. 
- await self.engine_core.abort_requests_async(reqs_to_abort) + state = await self.add_request( + request_id, + prompt, + sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=priority, + ) + + while True: + try: + await asyncio.wait_for(state["event"].wait(), timeout=4) + out = state["out_list"][-1] + except asyncio.TimeoutError: + # if request is not None and await request.is_disconnected(): + # self.abort_request(obj.rid) + # raise ValueError(f"Abort request {obj.rid}") + continue + + state["out_list"] = [] + if state["finished"]: + del self.rid_to_state[request_id] + yield out + break + + state["event"].clear() + yield out + + # def _finish_stream(self, request_id: str): + # stream = self.request_streams.pop(request_id, None) + # if stream is not None: + # stream.finish() + + # def _add_request_to_streams( + # self, + # request_id: str, + # ) -> AsyncStream: + + # if request_id in self.request_streams: + # raise ValueError(f"Request id {request_id} already running.") + + # # Avoid streams having circular ref to parent AsyncLLM object. + # aborted_reqs = self.client_aborted_requests + # stream = AsyncStream(request_id, aborted_reqs.append) + # self.request_streams[request_id] = stream + + # if self.log_requests: + # logger.info("Added request %s.", request_id) + + # return stream + + # async def _process_cancellations(self) -> None: + # """ + # Process requests cancelled from user disconnecting. + + # When a client disconnects, AsyncStream._cancel() is called. + # We passed a callback to AsyncStream(), which appends to + # self.client_aborted_requests. + + # As a result, if any requests are canceled from the user side + # the request_id will show up in self.client_aborted_requests. + # """ + + # # Avoid streams having circular ref to parent AsyncLLM object. + # if not self.client_aborted_requests: + # return + # reqs_to_abort = self.client_aborted_requests.copy() + # self.client_aborted_requests.clear() + + # # Remove from Detokenizer. + # self.detokenizer.abort_requests(reqs_to_abort) + + # # Remove from RequestStreams. + # for request_id in reqs_to_abort: + # if self.log_requests: + # logger.info("User-cancelled request %s.", request_id) + # self._finish_stream(request_id) + + # # Remove from EngineCore. + # await self.engine_core.abort_requests_async(reqs_to_abort) def _process_request_outputs(self, request_outputs: List[RequestOutput]): """Process outputs by putting them into per-request AsyncStreams.""" for request_output in request_outputs: request_id = request_output.request_id - assert request_id in self.request_streams + assert request_id in self.rid_to_state + state = self.rid_to_state[request_id] + + if request_output.finished: + state["finished"] = True + + state["out_list"].append(request_output) + state["event"].set() - # Each request in the API server pulls from the per-request stream. - stream = self.request_streams.get(request_id) - if stream is not None: - stream.put(request_output) - # If finished, remove from the tracker. - if request_output.finished: - if self.log_requests: - logger.info("Finished request %s.", request_id) - self._finish_stream(request_id) + # # Each request in the API server pulls from the per-request stream. + # stream = self.request_streams.get(request_id) + # if stream is not None: + # stream.put(request_output) + + # # If finished, remove from the tracker. 
+ # if request_output.finished: + # if self.log_requests: + # logger.info("Finished request %s.", request_id) + # self._finish_stream(request_id) async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" @@ -309,14 +339,12 @@ async def _run_output_handler(self): await self.engine_core.abort_requests_async(reqs_to_abort) # 5) Abort any requests due to client cancellations. - await self._process_cancellations() + # await self._process_cancellations() except BaseException as e: logger.error(e) raise e - # TODO: can we eliminate these? - async def abort(self, request_id: str) -> None: # Note: Who Calls this? I dont think this is actually used. raise ValueError("Not Supported on V1 yet.") diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 56d4dc67e4a0e..470ad80ee045c 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -32,7 +32,7 @@ POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -LOGGING_TIME_S = 5000 +LOGGING_TIME_S = 5 class EngineCore: From cba2d54ae522f7ec43a1b3efb3e73bf1e4b8465b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 19:06:46 +0000 Subject: [PATCH 006/132] stash --- benchmarks/backend_request_func.py | 2 + benchmarks/benchmark_serving.py | 6 ++ examples/openai_completion_client.py | 2 +- vllm/entrypoints/openai/protocol.py | 5 +- vllm/entrypoints/openai/serving_completion.py | 68 ++++++++++--------- vllm/v1/engine/async_llm.py | 50 ++++++++------ 6 files changed, 77 insertions(+), 56 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index b67849038cf0d..7b324b9e9897a 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -40,6 +40,7 @@ class RequestFuncOutput: tpot: float = 0.0 # avg next-token latencies prompt_len: int = 0 error: str = "" + total_chunks: int = 0 async def async_request_tgi( @@ -269,6 +270,7 @@ async def async_request_openai_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") + output.total_chunks += 1 if chunk == "[DONE]": latency = time.perf_counter() - st else: diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 4eb0e1f8ac903..96dc55a078960 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -630,6 +630,12 @@ async def limited_request_func(request_func_input, pbar): pbar=pbar))) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + total_chunks = 0 + for output in outputs: + total_chunks += output.total_chunks + + print(f"TOTAL_CHUNKS: {total_chunks}") + if profile: print("Stopping profiler...") profile_input = RequestFuncInput( diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index 1f8b82bc5c9e9..d905d18f151f0 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -24,6 +24,6 @@ print("Completion results:") if stream: for c in completion: - print(c) + print(c.choices[0].text) else: print(completion) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 6ed7c2e9dcd6b..f0f04d2e512be 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -737,8 +737,9 @@ def to_sampling_params( logits_processors=get_logits_processors(self.logits_processors, logits_processor_pattern), truncate_prompt_tokens=self.truncate_prompt_tokens, - output_kind=RequestOutputKind.DELTA if self.stream \ - else 
RequestOutputKind.FINAL_ONLY, + # output_kind=RequestOutputKind.DELTA if self.stream \ + # else RequestOutputKind.FINAL_ONLY, + output_kind=RequestOutputKind.CUMULATIVE, guided_decoding=guided_decoding, logit_bias=self.logit_bias, allowed_token_ids=self.allowed_token_ids) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index d87c410c0124c..254937aab829c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -257,6 +257,8 @@ async def completion_stream_generator( else: include_usage, include_continuous_usage = False, False + streamed_text = "" + try: # async for prompt_idx, res in result_generator: async for res in result_generator: @@ -278,28 +280,29 @@ async def completion_stream_generator( assert request.max_tokens is not None if request.echo and not has_echoed[i]: - assert prompt_token_ids is not None - assert prompt_text is not None - if request.max_tokens == 0: - # only return the prompt - delta_text = prompt_text - delta_token_ids = prompt_token_ids - out_logprobs = prompt_logprobs - else: - assert prompt_logprobs is not None - # echo the prompt and first token - delta_text = prompt_text + output.text - delta_token_ids = [ - *prompt_token_ids, *output.token_ids - ] - out_logprobs = [ - *prompt_logprobs, - *(output.logprobs or []), - ] - has_echoed[i] = True + pass + # assert prompt_token_ids is not None + # assert prompt_text is not None + # if request.max_tokens == 0: + # # only return the prompt + # delta_text = prompt_text + # delta_token_ids = prompt_token_ids + # out_logprobs = prompt_logprobs + # else: + # assert prompt_logprobs is not None + # # echo the prompt and first token + # delta_text = prompt_text + output.text + # delta_token_ids = [ + # *prompt_token_ids, *output.token_ids + # ] + # out_logprobs = [ + # *prompt_logprobs, + # *(output.logprobs or []), + # ] + # has_echoed[i] = True else: # return just the delta - delta_text = output.text + delta_text = output.text[previous_text_lens[i]:] delta_token_ids = output.token_ids out_logprobs = output.logprobs @@ -309,20 +312,23 @@ async def completion_stream_generator( continue if request.logprobs is not None: - assert out_logprobs is not None, ( - "Did not output logprobs") - logprobs = self._create_completion_logprobs( - token_ids=delta_token_ids, - top_logprobs=out_logprobs, - num_output_top_logprobs=request.logprobs, - tokenizer=tokenizer, - initial_text_offset=previous_text_lens[i], - ) + pass + # assert out_logprobs is not None, ( + # "Did not output logprobs") + # logprobs = self._create_completion_logprobs( + # token_ids=delta_token_ids, + # top_logprobs=out_logprobs, + # num_output_top_logprobs=request.logprobs, + # tokenizer=tokenizer, + # initial_text_offset=previous_text_lens[i], + # ) else: logprobs = None - previous_text_lens[i] += len(output.text) - previous_num_tokens[i] += len(output.token_ids) + # previous_text_lens[i] += len(output.text) + # previous_num_tokens[i] += len(output.token_ids) + previous_text_lens[i] = len(output.text) + previous_num_tokens[i] = len(output.token_ids) finish_reason = output.finish_reason stop_reason = output.stop_reason diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 0c5eda420b102..dd2ebb06bdf2e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,4 +1,5 @@ import asyncio +from dataclasses import dataclass from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import 
ModelConfig, VllmConfig @@ -16,7 +17,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.v1.engine.async_stream import AsyncStream from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor @@ -24,6 +24,17 @@ logger = init_logger(__name__) +@dataclass +class RequestState: + + event: asyncio.Event + out_list: List[RequestOutput] + finished: bool + + @classmethod + def new(cls) -> "RequestState": + return cls(asyncio.Event(), [], False) + class AsyncLLM(EngineClient): @@ -46,6 +57,9 @@ def __init__( self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config + # RequestId -> RequestState. + self.rid_to_state: Dict[str, RequestState] = {} + # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, @@ -54,12 +68,6 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() - # # Request streams (map of request_id -> AsyncStream). - # self.request_streams: Dict[str, AsyncStream] = {} - # # List of cancelled request ids to be aborted. - # self.client_aborted_requests: List[str] = [] - self.rid_to_state = {} - # Processor (converts Inputs --> EngineCoreRequests). self.processor = Processor(vllm_config.model_config, vllm_config.lora_config, self.tokenizer, @@ -156,11 +164,9 @@ async def add_request( if self.detokenizer.is_request_active(request_id): raise ValueError(f"Request {request_id} already exists.") - state = { - "out_list": [], - "event": asyncio.Event(), - "finished": False, - } + # 1) Add to RequestState tracker. The "event" is used to manage + # concurrency between generate() and output_handler task. + state = RequestState.new() self.rid_to_state[request_id] = state # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. @@ -224,22 +230,22 @@ async def generate( ) while True: - try: - await asyncio.wait_for(state["event"].wait(), timeout=4) - out = state["out_list"][-1] + try: + await asyncio.wait_for(state.event.wait(), timeout=4) + out = state.out_list[-1] except asyncio.TimeoutError: # if request is not None and await request.is_disconnected(): # self.abort_request(obj.rid) # raise ValueError(f"Abort request {obj.rid}") continue - state["out_list"] = [] - if state["finished"]: + state.out_list = [] + if state.finished: del self.rid_to_state[request_id] yield out break - state["event"].clear() + state.event.clear() yield out # def _finish_stream(self, request_id: str): @@ -304,10 +310,10 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): state = self.rid_to_state[request_id] if request_output.finished: - state["finished"] = True - - state["out_list"].append(request_output) - state["event"].set() + state.finished = True + + state.out_list.append(request_output) + state.event.set() # # Each request in the API server pulls from the per-request stream. 
From 3ae44a8b53496ee54a3e200c612a3f8c03a366ee Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 22:40:36 +0000 Subject: [PATCH 007/132] stash --- examples/openai_completion_client.py | 9 ++- vllm/entrypoints/openai/protocol.py | 8 +- vllm/entrypoints/openai/serving_completion.py | 81 +++++++++---------- vllm/v1/engine/async_llm.py | 6 +- 4 files changed, 57 insertions(+), 47 deletions(-) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index d905d18f151f0..bbc6f27ece2fe 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -18,12 +18,17 @@ completion = client.completions.create( model=model, prompt="A robot may not injure a human being", - echo=False, + echo=True, + n=1, + logprobs=2, stream=stream) print("Completion results:") +text = "" if stream: for c in completion: - print(c.choices[0].text) + text += c.choices[0].text + print(c) + print(text) else: print(completion) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index f0f04d2e512be..3ed78edd8f84e 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -9,6 +9,7 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator from typing_extensions import Annotated +from vllm.envs import VLLM_USE_V1 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam from vllm.logger import init_logger from vllm.pooling_params import PoolingParams @@ -38,6 +39,11 @@ assert _LONG_INFO.max == _MOCK_LONG_INFO.max +STREAM_SAMPLING_OUTPUT_KIND = RequestOutputKind.DELTA +if VLLM_USE_V1: + STREAM_SAMPLING_OUTPUT_KIND = RequestOutputKind.CUMULATIVE + + class OpenAIBaseModel(BaseModel): # OpenAI API does allow extra fields model_config = ConfigDict(extra="allow") @@ -422,7 +428,7 @@ def to_sampling_params( logits_processor_pattern), include_stop_str_in_output=self.include_stop_str_in_output, truncate_prompt_tokens=self.truncate_prompt_tokens, - output_kind=RequestOutputKind.DELTA if self.stream \ + output_kind=STREAM_SAMPLING_OUTPUT_KIND if self.stream \ else RequestOutputKind.FINAL_ONLY, guided_decoding=guided_decoding, logit_bias=self.logit_bias) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 254937aab829c..c7416cd6ac492 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -159,10 +159,8 @@ async def create_completion( # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - # result_generator = merge_async_iterators( - # *generators, is_cancelled=raw_request.is_disconnected) - assert len(generators) == 1 - result_generator = generators[0] + result_generator = merge_async_iterators( + *generators, is_cancelled=raw_request.is_disconnected) model_name = self._get_model_name(lora_request) num_prompts = len(engine_prompts) @@ -257,11 +255,10 @@ async def completion_stream_generator( else: include_usage, include_continuous_usage = False, False - streamed_text = "" - try: - # async for prompt_idx, res in result_generator: - async for res in result_generator: + async for prompt_idx, res in result_generator: + # Output.text is cumulative + prompt_idx = 0 prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs @@ -280,31 +277,32 @@ async def completion_stream_generator( assert request.max_tokens is not None if request.echo and not has_echoed[i]: - pass - # assert prompt_token_ids is 
not None - # assert prompt_text is not None - # if request.max_tokens == 0: - # # only return the prompt - # delta_text = prompt_text - # delta_token_ids = prompt_token_ids - # out_logprobs = prompt_logprobs - # else: - # assert prompt_logprobs is not None - # # echo the prompt and first token - # delta_text = prompt_text + output.text - # delta_token_ids = [ - # *prompt_token_ids, *output.token_ids - # ] - # out_logprobs = [ - # *prompt_logprobs, - # *(output.logprobs or []), - # ] - # has_echoed[i] = True + assert prompt_token_ids is not None + assert prompt_text is not None + # If we not echoed, we have not sent text yet. + assert previous_text_lens[i] == 0 + assert previous_num_tokens[i] == 0 + if request.max_tokens == 0: + # only return the prompt + delta_text = prompt_text + delta_token_ids = prompt_token_ids + out_logprobs = prompt_logprobs + else: + assert prompt_logprobs is not None + # echo the prompt and first token + delta_text = prompt_text + output.text + delta_token_ids = [ + *prompt_token_ids, *output.token_ids + ] + out_logprobs = [ + *prompt_logprobs, + *(output.logprobs or []), + ] + has_echoed[i] = True else: - # return just the delta delta_text = output.text[previous_text_lens[i]:] - delta_token_ids = output.token_ids - out_logprobs = output.logprobs + delta_token_ids = output.token_ids[previous_num_tokens[i]:] + out_logprobs = output.logprobs[previous_num_tokens[i]:] if output.logprobs else None if not delta_text and not delta_token_ids \ and not previous_num_tokens[i]: @@ -312,21 +310,18 @@ async def completion_stream_generator( continue if request.logprobs is not None: - pass - # assert out_logprobs is not None, ( - # "Did not output logprobs") - # logprobs = self._create_completion_logprobs( - # token_ids=delta_token_ids, - # top_logprobs=out_logprobs, - # num_output_top_logprobs=request.logprobs, - # tokenizer=tokenizer, - # initial_text_offset=previous_text_lens[i], - # ) + assert out_logprobs is not None, ( + "Did not output logprobs") + logprobs = self._create_completion_logprobs( + token_ids=delta_token_ids, + top_logprobs=out_logprobs, + num_output_top_logprobs=request.logprobs, + tokenizer=tokenizer, + initial_text_offset=previous_text_lens[i], + ) else: logprobs = None - # previous_text_lens[i] += len(output.text) - # previous_num_tokens[i] += len(output.token_ids) previous_text_lens[i] = len(output.text) previous_num_tokens[i] = len(output.token_ids) finish_reason = output.finish_reason diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index dd2ebb06bdf2e..d78ac92127ef0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -24,6 +24,8 @@ logger = init_logger(__name__) +WAITING_TIMEOUT_MS=5 + @dataclass class RequestState: @@ -231,9 +233,11 @@ async def generate( while True: try: - await asyncio.wait_for(state.event.wait(), timeout=4) + await asyncio.wait_for(state.event.wait(), timeout=WAITING_TIMEOUT_MS) out = state.out_list[-1] + except asyncio.TimeoutError: + logger.debug("Timeout waiting for %s", request_id) # if request is not None and await request.is_disconnected(): # self.abort_request(obj.rid) # raise ValueError(f"Abort request {obj.rid}") From 3ef56872f00324fdf5d01c9786f29f874690a25d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 22:43:06 +0000 Subject: [PATCH 008/132] remove async stream --- vllm/v1/engine/async_stream.py | 55 ---------------------------------- 1 file changed, 55 deletions(-) delete mode 100644 vllm/v1/engine/async_stream.py diff --git 
a/vllm/v1/engine/async_stream.py b/vllm/v1/engine/async_stream.py deleted file mode 100644 index 35449238c3259..0000000000000 --- a/vllm/v1/engine/async_stream.py +++ /dev/null @@ -1,55 +0,0 @@ -import asyncio -from typing import Any, AsyncGenerator, Callable, Optional, Type, Union - -from vllm.outputs import PoolingRequestOutput, RequestOutput - - -class AsyncStream: - """A stream of RequestOutputs or PoolingRequestOutputs for a request - that can be iterated over asynchronously via an async generator.""" - - STOP_ITERATION = Exception() # Sentinel - - def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: - self.request_id = request_id - self._cancel = cancel - self._queue: asyncio.Queue = asyncio.Queue() - self._finished = False - - def put(self, item: Union[RequestOutput, PoolingRequestOutput, - Exception]) -> None: - if not self._finished: - self._queue.put_nowait(item) - - def finish( - self, - exception: Optional[Union[BaseException, Type[BaseException]]] = None, - ) -> None: - if not self._finished: - self._finished = True - self._queue.put_nowait(exception if self._is_raisable(exception) - else AsyncStream.STOP_ITERATION) - - async def generator( - self - ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: - finished = False - try: - while True: - result = await self._queue.get() - if self._is_raisable(result): - finished = True - if result == AsyncStream.STOP_ITERATION: - return - raise result - yield result - finally: - self._finished = True - if not finished: - self._cancel(self.request_id) - - @staticmethod - def _is_raisable(value: Any): - return isinstance(value, BaseException) or \ - (isinstance(value, type) and \ - issubclass(value, BaseException)) From b350084e509276081331597ab9c8c36e3465aec9 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 22:54:51 +0000 Subject: [PATCH 009/132] fix protocol --- vllm/entrypoints/openai/protocol.py | 5 +- vllm/v1/engine/async_llm.py | 96 ++++++++++++----------------- 2 files changed, 42 insertions(+), 59 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 3ed78edd8f84e..e3c6a33ef430d 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -743,9 +743,8 @@ def to_sampling_params( logits_processors=get_logits_processors(self.logits_processors, logits_processor_pattern), truncate_prompt_tokens=self.truncate_prompt_tokens, - # output_kind=RequestOutputKind.DELTA if self.stream \ - # else RequestOutputKind.FINAL_ONLY, - output_kind=RequestOutputKind.CUMULATIVE, + output_kind=STREAM_SAMPLING_OUTPUT_KIND if self.stream \ + else RequestOutputKind.FINAL_ONLY, guided_decoding=guided_decoding, logit_bias=self.logit_bias, allowed_token_ids=self.allowed_token_ids) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d78ac92127ef0..2668190a46d03 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -201,17 +201,17 @@ async def generate( ) -> AsyncGenerator[RequestOutput, None]: """ Main function called by the API server to kick off a request - * 1) Making an AsyncStream corresponding to the Request. + * 1) Making an RequestState corresponding to the Request. # 2) Processing the Input. * 3) Adding the Request to the Detokenizer. * 4) Adding the Request to the EngineCore (separate process). - A separate output_handler loop runs in a background AsyncIO task, - pulling outputs from EngineCore and putting them into the - per-request AsyncStream. 
+ A separate output_handler loop runs in a background task, + pulling outputs from EngineCore and updating the RequestState and + setting the asyncio Event. - The caller of generate() iterates the returned AsyncGenerator, - returning the RequestOutput back to the caller. + The caller of generate() waits on the asyncio event and forwards + the latest RequestOutput back to the caller. """ # We start the output_handler on the first call to generate() so that @@ -232,15 +232,22 @@ async def generate( ) while True: - try: + try: await asyncio.wait_for(state.event.wait(), timeout=WAITING_TIMEOUT_MS) + + # NOTE(rob): out_list can have more than one item. However, in the + # streaming case, we use RequestOutputKind.CUMULATIVE, which has the + # full generated text output (not just the text corresponding to the + # last token). So, we can just send the last item and the API Client + # handles converting the stream buffer into a delta text. This way + # we do "dynamic chunked streaming", such that the API client does not + # fall behind the EngineCore (which happens at high QPS othwerwise). out = state.out_list[-1] except asyncio.TimeoutError: logger.debug("Timeout waiting for %s", request_id) - # if request is not None and await request.is_disconnected(): - # self.abort_request(obj.rid) - # raise ValueError(f"Abort request {obj.rid}") + + # TODO (rob): do request cancellation checking here. continue state.out_list = [] @@ -252,58 +259,35 @@ async def generate( state.event.clear() yield out - # def _finish_stream(self, request_id: str): - # stream = self.request_streams.pop(request_id, None) - # if stream is not None: - # stream.finish() - - # def _add_request_to_streams( - # self, - # request_id: str, - # ) -> AsyncStream: - - # if request_id in self.request_streams: - # raise ValueError(f"Request id {request_id} already running.") - - # # Avoid streams having circular ref to parent AsyncLLM object. - # aborted_reqs = self.client_aborted_requests - # stream = AsyncStream(request_id, aborted_reqs.append) - # self.request_streams[request_id] = stream - - # if self.log_requests: - # logger.info("Added request %s.", request_id) - - # return stream - - # async def _process_cancellations(self) -> None: - # """ - # Process requests cancelled from user disconnecting. + async def _process_cancellations(self) -> None: + """ + Process requests cancelled from user disconnecting. - # When a client disconnects, AsyncStream._cancel() is called. - # We passed a callback to AsyncStream(), which appends to - # self.client_aborted_requests. + When a client disconnects, AsyncStream._cancel() is called. + We passed a callback to AsyncStream(), which appends to + self.client_aborted_requests. - # As a result, if any requests are canceled from the user side - # the request_id will show up in self.client_aborted_requests. - # """ + As a result, if any requests are canceled from the user side + the request_id will show up in self.client_aborted_requests. + """ - # # Avoid streams having circular ref to parent AsyncLLM object. - # if not self.client_aborted_requests: - # return - # reqs_to_abort = self.client_aborted_requests.copy() - # self.client_aborted_requests.clear() + # Avoid streams having circular ref to parent AsyncLLM object. + if not self.client_aborted_requests: + return + reqs_to_abort = self.client_aborted_requests.copy() + self.client_aborted_requests.clear() - # # Remove from Detokenizer. - # self.detokenizer.abort_requests(reqs_to_abort) + # Remove from Detokenizer. 
+ self.detokenizer.abort_requests(reqs_to_abort) - # # Remove from RequestStreams. - # for request_id in reqs_to_abort: - # if self.log_requests: - # logger.info("User-cancelled request %s.", request_id) - # self._finish_stream(request_id) + # Remove from RequestStreams. + for request_id in reqs_to_abort: + if self.log_requests: + logger.info("User-cancelled request %s.", request_id) + self._finish_stream(request_id) - # # Remove from EngineCore. - # await self.engine_core.abort_requests_async(reqs_to_abort) + # Remove from EngineCore. + await self.engine_core.abort_requests_async(reqs_to_abort) def _process_request_outputs(self, request_outputs: List[RequestOutput]): """Process outputs by putting them into per-request AsyncStreams.""" From abd7fa39adcab529f6f6cfa668916a831075d2f2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 22:55:35 +0000 Subject: [PATCH 010/132] clean up completion client --- vllm/entrypoints/openai/serving_completion.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index c7416cd6ac492..1f0a5db439580 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -257,9 +257,6 @@ async def completion_stream_generator( try: async for prompt_idx, res in result_generator: - # Output.text is cumulative - - prompt_idx = 0 prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt From 6986457965b559534cbc2d3d9879113e2ea31ecd Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:08:18 +0000 Subject: [PATCH 011/132] stash --- vllm/v1/engine/async_llm.py | 39 ++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 2668190a46d03..c49a55e9d1139 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -13,7 +13,7 @@ from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import SamplingParams +from vllm.sampling_params import SamplingParams, RequestOutputKind from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext @@ -28,6 +28,10 @@ @dataclass class RequestState: + """RequestState manages concurrency between the output_handler, + which pulls outputs from EngineCore and the user-facing generate() + function the + """ event: asyncio.Event out_list: List[RequestOutput] @@ -59,9 +63,6 @@ def __init__( self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config - # RequestId -> RequestState. - self.rid_to_state: Dict[str, RequestState] = {} - # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, @@ -70,6 +71,11 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() + # RequestId -> RequestState. + self.rid_to_state: Dict[str, RequestState] = {} + # List of cancelled request ids to be aborted. + self.client_aborted_requests: List[str] = [] + # Processor (converts Inputs --> EngineCoreRequests). 
self.processor = Processor(vllm_config.model_config, vllm_config.lora_config, self.tokenizer, @@ -160,16 +166,15 @@ async def add_request( trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> asyncio.Event: + ) -> RequestState: """Add new request to the AsyncLLM.""" if self.detokenizer.is_request_active(request_id): raise ValueError(f"Request {request_id} already exists.") # 1) Add to RequestState tracker. The "event" is used to manage - # concurrency between generate() and output_handler task. - state = RequestState.new() - self.rid_to_state[request_id] = state + # concurrency between generate() and output_handler(). + self.rid_to_state[request_id] = RequestState.new() # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. detokenizer_req, engine_core_req = self.processor.process_inputs( @@ -182,7 +187,7 @@ async def add_request( # 4) Add the EngineCoreRequest to EngineCore (separate process). await self.engine_core.add_request_async(engine_core_req) - return state + return self.rid_to_state[request_id] # TODO: we should support multiple prompts in one call, as you # can do with LLM.generate. So that for multi-prompt completion @@ -201,19 +206,22 @@ async def generate( ) -> AsyncGenerator[RequestOutput, None]: """ Main function called by the API server to kick off a request - * 1) Making an RequestState corresponding to the Request. + * 1) Make RequestState corresponding to the Request. # 2) Processing the Input. * 3) Adding the Request to the Detokenizer. * 4) Adding the Request to the EngineCore (separate process). - A separate output_handler loop runs in a background task, - pulling outputs from EngineCore and updating the RequestState and - setting the asyncio Event. + The output_handler() loop runs in a background task, pulling outputs from + EngineCore and updating the RequestState and setting the asyncio event. The caller of generate() waits on the asyncio event and forwards - the latest RequestOutput back to the caller. + the latest RequestOutput back to the caller. """ + # DELTA streaming is not supported due to dynamic chunking. + assert (sampling_params.output_kind == RequestOutputKind.CUMULATIVE or + sampling_params.output_kind == RequestOutputKind.FINAL_ONLY) + # We start the output_handler on the first call to generate() so that # we can call __init__ before the event loop starts, which enables us # to handle startup failure gracefully in the OpenAI server. @@ -245,9 +253,8 @@ async def generate( out = state.out_list[-1] except asyncio.TimeoutError: + # TODO(rob): do request cancellation checking here. logger.debug("Timeout waiting for %s", request_id) - - # TODO (rob): do request cancellation checking here. continue state.out_list = [] From 816e9658a7122cee6eab3e43e5cc45fdf2250e03 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:09:03 +0000 Subject: [PATCH 012/132] updated --- vllm/v1/engine/async_llm.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index c49a55e9d1139..3b755935909e4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -310,18 +310,6 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): state.out_list.append(request_output) state.event.set() - - # # Each request in the API server pulls from the per-request stream. 
- # stream = self.request_streams.get(request_id) - # if stream is not None: - # stream.put(request_output) - - # # If finished, remove from the tracker. - # if request_output.finished: - # if self.log_requests: - # logger.info("Finished request %s.", request_id) - # self._finish_stream(request_id) - async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" From cebf2870b517a10430e9b2faeb28c9516131291d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:11:35 +0000 Subject: [PATCH 013/132] updated comment --- vllm/v1/engine/async_llm.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 3b755935909e4..c1e6b8940a9c4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -300,16 +300,17 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): """Process outputs by putting them into per-request AsyncStreams.""" for request_output in request_outputs: - request_id = request_output.request_id - assert request_id in self.rid_to_state - state = self.rid_to_state[request_id] - + assert request_output.request_id in self.rid_to_state + + # Update the RequestState and alert generate() that there + # is a RequestOutput ready to return to the user. + state = self.rid_to_state[request_output.request_id] if request_output.finished: state.finished = True - state.out_list.append(request_output) state.event.set() + async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" From adcc3d291fa22e81f838bbafc52516a2d9b7c5a9 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:28:35 +0000 Subject: [PATCH 014/132] remove comptibility --- vllm/entrypoints/openai/serving_completion.py | 56 +++++++++++++++---- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 1f0a5db439580..99878be97acaf 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -27,7 +27,8 @@ PromptAdapterPath) from vllm.logger import init_logger from vllm.outputs import RequestOutput -from vllm.sampling_params import BeamSearchParams, SamplingParams +from vllm.sampling_params import (BeamSearchParams, SamplingParams, + RequestOutputKind) from vllm.sequence import Logprob from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import merge_async_iterators @@ -182,7 +183,9 @@ async def create_completion( model_name, num_prompts=num_prompts, tokenizer=tokenizer, - request_metadata=request_metadata) + request_metadata=request_metadata, + output_kind=sampling_params.output_kind, + ) # Non-streaming response final_res_batch: List[Optional[RequestOutput]] = [None] * num_prompts @@ -240,7 +243,32 @@ async def completion_stream_generator( num_prompts: int, tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, + output_kind: RequestOutputKind, ) -> AsyncGenerator[str, None]: + """ + In V0, we use RequestOutputType.DELTA and each RequestOutput + from the result_generator is guarenteed to correspond to + a single token. + + To handle this, we can simply constuct the Streaming + + In V1, we use RequestOutputType.CUMULATIVE and each RequestOutput + from the result_genrator is not guarenteed to correspond to + a single token (it could correspond to 2+ tokens). 
+ + To handle this, we need to maintain state around how many + characters and tokens have been returned so far, and dynamically + stream back just the delta (where the delta could be the text + corresponding to N tokens). + + We do this to dynamically adjust how much work the API server + is doing. If the QPS is high and streaming becomes a bottleneck, + such that the API server falls behind, we dynamically fall back + to streaming chunks of tokens. + """ + assert (output_kind == RequestOutputKind.CUMULATIVE or + output_kind == RequestOutputKind.DELTA) + num_choices = 1 if request.n is None else request.n previous_text_lens = [0] * num_choices * num_prompts previous_num_tokens = [0] * num_choices * num_prompts @@ -276,9 +304,6 @@ async def completion_stream_generator( if request.echo and not has_echoed[i]: assert prompt_token_ids is not None assert prompt_text is not None - # If we not echoed, we have not sent text yet. - assert previous_text_lens[i] == 0 - assert previous_num_tokens[i] == 0 if request.max_tokens == 0: # only return the prompt delta_text = prompt_text @@ -297,9 +322,15 @@ async def completion_stream_generator( ] has_echoed[i] = True else: - delta_text = output.text[previous_text_lens[i]:] - delta_token_ids = output.token_ids[previous_num_tokens[i]:] - out_logprobs = output.logprobs[previous_num_tokens[i]:] if output.logprobs else None + if output_kind == RequestOutputKind.CUMULATIVE: + delta_text = output.text[previous_text_lens[i]:] + delta_token_ids = output.token_ids[previous_num_tokens[i]:] + out_logprobs = (output.logprobs[previous_num_tokens[i]:] if + output.logprobs else None) + else: + delta_text = output.text + delta_token_ids = output.token_ids + out_logprobs = output.logprobs if not delta_text and not delta_token_ids \ and not previous_num_tokens[i]: @@ -319,8 +350,13 @@ async def completion_stream_generator( else: logprobs = None - previous_text_lens[i] = len(output.text) - previous_num_tokens[i] = len(output.token_ids) + if output_kind == RequestOutputKind.CUMULATIVE: + previous_text_lens[i] = len(output.text) + previous_num_tokens[i] = len(output.token_ids) + else: + previous_text_lens[i] += len(output.text) + previous_num_tokens[i] += len(output.token_ids) + finish_reason = output.finish_reason stop_reason = output.stop_reason From 4344f1bb986abc31f7ba0a8b292878d62b9c1f30 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:34:07 +0000 Subject: [PATCH 015/132] format --- benchmarks/benchmark_serving.py | 4 +- vllm/entrypoints/openai/protocol.py | 3 +- vllm/entrypoints/openai/serving_completion.py | 25 ++++++------ vllm/v1/engine/async_llm.py | 38 ++++++++++--------- 4 files changed, 36 insertions(+), 34 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 96dc55a078960..e9a9ef366004a 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -630,12 +630,12 @@ async def limited_request_func(request_func_input, pbar): pbar=pbar))) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) - total_chunks = 0 + total_chunks = 0 for output in outputs: total_chunks += output.total_chunks print(f"TOTAL_CHUNKS: {total_chunks}") - + if profile: print("Stopping profiler...") profile_input = RequestFuncInput( diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index e3c6a33ef430d..cfc02013dd8c5 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -9,8 +9,8 @@ from pydantic import 
BaseModel, ConfigDict, Field, model_validator from typing_extensions import Annotated -from vllm.envs import VLLM_USE_V1 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam +from vllm.envs import VLLM_USE_V1 from vllm.logger import init_logger from vllm.pooling_params import PoolingParams from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, @@ -38,7 +38,6 @@ assert _LONG_INFO.min == _MOCK_LONG_INFO.min assert _LONG_INFO.max == _MOCK_LONG_INFO.max - STREAM_SAMPLING_OUTPUT_KIND = RequestOutputKind.DELTA if VLLM_USE_V1: STREAM_SAMPLING_OUTPUT_KIND = RequestOutputKind.CUMULATIVE diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 99878be97acaf..f8f00d53bd30f 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -27,8 +27,8 @@ PromptAdapterPath) from vllm.logger import init_logger from vllm.outputs import RequestOutput -from vllm.sampling_params import (BeamSearchParams, SamplingParams, - RequestOutputKind) +from vllm.sampling_params import (BeamSearchParams, RequestOutputKind, + SamplingParams) from vllm.sequence import Logprob from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import merge_async_iterators @@ -172,6 +172,7 @@ async def create_completion( stream = (request.stream and (request.best_of is None or request.n == request.best_of) and not request.use_beam_search) + assert isinstance(sampling_params, SamplingParams) # Streaming response if stream: @@ -247,13 +248,11 @@ async def completion_stream_generator( ) -> AsyncGenerator[str, None]: """ In V0, we use RequestOutputType.DELTA and each RequestOutput - from the result_generator is guarenteed to correspond to - a single token. - - To handle this, we can simply constuct the Streaming + from the result_generator is guaranteed to correspond to + a single token so In V1, we use RequestOutputType.CUMULATIVE and each RequestOutput - from the result_genrator is not guarenteed to correspond to + from the result_generator is not guaranteed to correspond to a single token (it could correspond to 2+ tokens). To handle this, we need to maintain state around how many @@ -266,8 +265,8 @@ async def completion_stream_generator( such that the API server falls behind, we dynamically fall back to streaming chunks of tokens. 
""" - assert (output_kind == RequestOutputKind.CUMULATIVE or - output_kind == RequestOutputKind.DELTA) + assert (output_kind == RequestOutputKind.CUMULATIVE + or output_kind == RequestOutputKind.DELTA) num_choices = 1 if request.n is None else request.n previous_text_lens = [0] * num_choices * num_prompts @@ -324,9 +323,11 @@ async def completion_stream_generator( else: if output_kind == RequestOutputKind.CUMULATIVE: delta_text = output.text[previous_text_lens[i]:] - delta_token_ids = output.token_ids[previous_num_tokens[i]:] - out_logprobs = (output.logprobs[previous_num_tokens[i]:] if - output.logprobs else None) + delta_token_ids = output.token_ids[ + previous_num_tokens[i]:] + out_logprobs = ( + output.logprobs[previous_num_tokens[i]:] + if output.logprobs else None) else: delta_text = output.text delta_token_ids = output.token_ids diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index c1e6b8940a9c4..434378cc4ada0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -10,10 +10,10 @@ from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.outputs import PoolingRequestOutput, RequestOutput +from vllm.outputs import RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import SamplingParams, RequestOutputKind +from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext @@ -24,7 +24,8 @@ logger = init_logger(__name__) -WAITING_TIMEOUT_MS=5 +WAITING_TIMEOUT_MS = 5 + @dataclass class RequestState: @@ -211,7 +212,7 @@ async def generate( * 3) Adding the Request to the Detokenizer. * 4) Adding the Request to the EngineCore (separate process). - The output_handler() loop runs in a background task, pulling outputs from + The output_handler() loop runs in a background task, pulling from EngineCore and updating the RequestState and setting the asyncio event. The caller of generate() waits on the asyncio event and forwards @@ -219,8 +220,8 @@ async def generate( """ # DELTA streaming is not supported due to dynamic chunking. - assert (sampling_params.output_kind == RequestOutputKind.CUMULATIVE or - sampling_params.output_kind == RequestOutputKind.FINAL_ONLY) + assert (sampling_params.output_kind == RequestOutputKind.CUMULATIVE + or sampling_params.output_kind == RequestOutputKind.FINAL_ONLY) # We start the output_handler on the first call to generate() so that # we can call __init__ before the event loop starts, which enables us @@ -238,18 +239,20 @@ async def generate( prompt_adapter_request=prompt_adapter_request, priority=priority, ) - + while True: try: - await asyncio.wait_for(state.event.wait(), timeout=WAITING_TIMEOUT_MS) - - # NOTE(rob): out_list can have more than one item. However, in the - # streaming case, we use RequestOutputKind.CUMULATIVE, which has the - # full generated text output (not just the text corresponding to the - # last token). So, we can just send the last item and the API Client - # handles converting the stream buffer into a delta text. This way - # we do "dynamic chunked streaming", such that the API client does not - # fall behind the EngineCore (which happens at high QPS othwerwise). 
+ await asyncio.wait_for(state.event.wait(), + timeout=WAITING_TIMEOUT_MS) + + # NOTE(rob): out_list can have more than one item. However, + # in the streaming case, we use RequestOutputKind.CUMULATIVE, + # which has the full generated text output (not just the text + # corresponding to the last token). So, we can just send the + # last RequestOutput and the API Client handles converting into + # a delta text. This way we do "dynamic chunked streaming", such + # that the API client does not fall behind the EngineCor, + # which happens at high QPS otherwise. out = state.out_list[-1] except asyncio.TimeoutError: @@ -301,7 +304,7 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): for request_output in request_outputs: assert request_output.request_id in self.rid_to_state - + # Update the RequestState and alert generate() that there # is a RequestOutput ready to return to the user. state = self.rid_to_state[request_output.request_id] @@ -310,7 +313,6 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): state.out_list.append(request_output) state.event.set() - async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" From d7b42a02125d9a2efdf3053d6376d4accfb90c58 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:38:18 +0000 Subject: [PATCH 016/132] format/comments --- vllm/v1/engine/async_llm.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 434378cc4ada0..e03d502c258b8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -29,18 +29,24 @@ @dataclass class RequestState: - """RequestState manages concurrency between the output_handler, - which pulls outputs from EngineCore and the user-facing generate() - function the + """ + RequestState manages concurrency between: + * the output_handler(), which pulls outputs from EngineCore + * the per-request generate(), which is the interface to client code. + + The output_handler adds new RequestOutputs to out_list and sets the + asyncio event, notifying the generate() that there is work to do. + + generate() waits on the asyncio event and yields the data from + out_list back to the caller generate() """ event: asyncio.Event out_list: List[RequestOutput] - finished: bool @classmethod def new(cls) -> "RequestState": - return cls(asyncio.Event(), [], False) + return cls(asyncio.Event(), []) class AsyncLLM(EngineClient): @@ -261,7 +267,7 @@ async def generate( continue state.out_list = [] - if state.finished: + if out.finished: del self.rid_to_state[request_id] yield out break @@ -308,8 +314,6 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): # Update the RequestState and alert generate() that there # is a RequestOutput ready to return to the user. 
state = self.rid_to_state[request_output.request_id] - if request_output.finished: - state.finished = True state.out_list.append(request_output) state.event.set() From c987a763e53917064e6fa3a3ff08fb00fa2d0e66 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:39:05 +0000 Subject: [PATCH 017/132] update comment --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index e03d502c258b8..1cac46bc7d418 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -32,7 +32,7 @@ class RequestState: """ RequestState manages concurrency between: * the output_handler(), which pulls outputs from EngineCore - * the per-request generate(), which is the interface to client code. + * the per-request generate(), which yields to the API server The output_handler adds new RequestOutputs to out_list and sets the asyncio event, notifying the generate() that there is work to do. From f3ff0e070507eba56f8d67dd40b59e2a4edfc550 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:40:01 +0000 Subject: [PATCH 018/132] format --- vllm/entrypoints/openai/serving_completion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index f8f00d53bd30f..7a40c312f49ce 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -249,7 +249,8 @@ async def completion_stream_generator( """ In V0, we use RequestOutputType.DELTA and each RequestOutput from the result_generator is guaranteed to correspond to - a single token so + a single token so can construct the outputs without needing + to maintain any state. In V1, we use RequestOutputType.CUMULATIVE and each RequestOutput from the result_generator is not guaranteed to correspond to From fbf647f5308451539f84fcc86c77d889d5ac4985 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:41:20 +0000 Subject: [PATCH 019/132] updated examples --- examples/openai_completion_client.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index bbc6f27ece2fe..205755da1c34f 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -2,33 +2,29 @@ # Modify OpenAI's API key and API base to use vLLM's API server. 
openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8001/v1" +openai_api_base = "http://localhost:8000/v1" client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") api_key=openai_api_key, base_url=openai_api_base, ) - models = client.models.list() model = models.data[0].id # Completion API -stream = True +stream = False completion = client.completions.create( model=model, prompt="A robot may not injure a human being", - echo=True, - n=1, - logprobs=2, - stream=stream) + echo=False, + n=2, + stream=stream, + logprobs=3) print("Completion results:") -text = "" if stream: for c in completion: - text += c.choices[0].text print(c) - print(text) else: print(completion) From b1105b926e648c4646fe843951d07353b25bc14e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:42:15 +0000 Subject: [PATCH 020/132] more cleaning --- benchmarks/backend_request_func.py | 2 -- benchmarks/benchmark_serving.py | 6 ------ 2 files changed, 8 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 7b324b9e9897a..b67849038cf0d 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -40,7 +40,6 @@ class RequestFuncOutput: tpot: float = 0.0 # avg next-token latencies prompt_len: int = 0 error: str = "" - total_chunks: int = 0 async def async_request_tgi( @@ -270,7 +269,6 @@ async def async_request_openai_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - output.total_chunks += 1 if chunk == "[DONE]": latency = time.perf_counter() - st else: diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index e9a9ef366004a..4eb0e1f8ac903 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -630,12 +630,6 @@ async def limited_request_func(request_func_input, pbar): pbar=pbar))) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) - total_chunks = 0 - for output in outputs: - total_chunks += output.total_chunks - - print(f"TOTAL_CHUNKS: {total_chunks}") - if profile: print("Stopping profiler...") profile_input = RequestFuncInput( From ea7289bb43c6e5db534bc8c3a31ecd451cd080c5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:42:36 +0000 Subject: [PATCH 021/132] make pr smaller --- examples/openai_completion_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index 205755da1c34f..58519f978d340 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -9,6 +9,7 @@ api_key=openai_api_key, base_url=openai_api_base, ) + models = client.models.list() model = models.data[0].id From 06dcb1b8be638e1e03df355ae775cf938a2efe4e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 17 Dec 2024 02:19:44 +0000 Subject: [PATCH 022/132] updated --- examples/openai_completion_client.py | 2 +- vllm/v1/engine/core.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index 58519f978d340..2c8525fd392e1 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -14,7 +14,7 @@ model = models.data[0].id # Completion API -stream = False +stream = True completion = client.completions.create( model=model, prompt="A robot may not injure a human being", diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 470ad80ee045c..9e8b47d39463f 
100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -32,7 +32,7 @@ POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -LOGGING_TIME_S = 5 +LOGGING_TIME_S = 1 class EngineCore: From 9628575d6fea679ed6b023d44d582ce0db330532 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 17 Dec 2024 02:21:10 +0000 Subject: [PATCH 023/132] added log --- vllm/v1/engine/async_llm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 1cac46bc7d418..a05cf775abaac 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -259,6 +259,8 @@ async def generate( # a delta text. This way we do "dynamic chunked streaming", such # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. + if len(state.out_list) > 1: + logger.info(f"{len(state.out_list)=}") out = state.out_list[-1] except asyncio.TimeoutError: From 5d824dff040492c7bf3052b269cedfd56c33893e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 17 Dec 2024 03:03:19 +0000 Subject: [PATCH 024/132] remove log --- vllm/v1/engine/async_llm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a05cf775abaac..1cac46bc7d418 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -259,8 +259,6 @@ async def generate( # a delta text. This way we do "dynamic chunked streaming", such # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. - if len(state.out_list) > 1: - logger.info(f"{len(state.out_list)=}") out = state.out_list[-1] except asyncio.TimeoutError: From 26814f1d077ab7d3f0202f5dd44cf98d2a391d32 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 17 Dec 2024 03:14:45 +0000 Subject: [PATCH 025/132] updated --- vllm/v1/engine/async_llm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 1cac46bc7d418..aec24c926ce1e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -309,7 +309,9 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): """Process outputs by putting them into per-request AsyncStreams.""" for request_output in request_outputs: - assert request_output.request_id in self.rid_to_state + if request_output.request_id not in self.rid_to_state: + raise RuntimeError(f"{request_output.request_id} " + "not in RequestStates") # Update the RequestState and alert generate() that there # is a RequestOutput ready to return to the user. 
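The cumulative-to-delta conversion that the streaming comments above describe can be shown in isolation. The sketch below is a standalone illustration, not the serving_completion.py implementation; CumulativeChunk and the sample data are invented for the example. The consumer only ever sees cumulative text and token ids and recovers per-chunk deltas by remembering how much it has already emitted, which is the role previous_text_lens and previous_num_tokens play above.

from dataclasses import dataclass
from typing import List


@dataclass
class CumulativeChunk:
    # Cumulative view of one request so far: full text and all token ids.
    text: str
    token_ids: List[int]


def to_deltas(chunks: List[CumulativeChunk]) -> List[CumulativeChunk]:
    """Turn cumulative chunks into delta chunks by tracking how much text
    and how many tokens have already been emitted."""
    deltas: List[CumulativeChunk] = []
    prev_text_len = 0
    prev_num_tokens = 0
    for chunk in chunks:
        deltas.append(
            CumulativeChunk(chunk.text[prev_text_len:],
                            chunk.token_ids[prev_num_tokens:]))
        prev_text_len = len(chunk.text)
        prev_num_tokens = len(chunk.token_ids)
    return deltas


if __name__ == "__main__":
    # A single cumulative chunk may cover several new tokens, e.g. when the
    # API server falls behind and outputs get batched up.
    cumulative = [
        CumulativeChunk("Hello", [1]),
        CumulativeChunk("Hello world", [1, 2]),
        CumulativeChunk("Hello world, how are you", [1, 2, 3, 4, 5]),
    ]
    for delta in to_deltas(cumulative):
        print(repr(delta.text), delta.token_ids)

This is why the delta path slices with output.text[previous_text_lens[i]:] rather than concatenating: each slice is exactly the text the client has not yet received, regardless of how many tokens arrived since the last chunk.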
From 1205764b975c4bf438b93ec155d6d86d84460b25 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 17 Dec 2024 16:54:57 +0000 Subject: [PATCH 026/132] Stash --- vllm/entrypoints/openai/serving_completion.py | 10 ++++++--- vllm/v1/engine/async_llm.py | 7 ++++--- vllm/v1/engine/core.py | 2 +- vllm/v1/engine/core_client.py | 9 +++++--- vllm/v1/engine/detokenizer.py | 21 ++++++++++++++++--- vllm/v1/utils.py | 16 +++++++++++++- 6 files changed, 51 insertions(+), 14 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 7a40c312f49ce..3b0bc2fa897c6 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -160,8 +160,10 @@ async def create_completion( # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - result_generator = merge_async_iterators( - *generators, is_cancelled=raw_request.is_disconnected) + # result_generator = merge_async_iterators( + # *generators, is_cancelled=raw_request.is_disconnected) + assert len(generators) == 1 + result_generator = generators[0] model_name = self._get_model_name(lora_request) num_prompts = len(engine_prompts) @@ -284,7 +286,9 @@ async def completion_stream_generator( include_usage, include_continuous_usage = False, False try: - async for prompt_idx, res in result_generator: + # async for prompt_idx, res in result_generator: + async for res in result_generator: + prompt_idx = 0 prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b3e25597f4499..5f2f596efd1d9 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -184,9 +184,6 @@ async def add_request( ) -> RequestState: """Add new request to the AsyncLLM.""" - # if self.detokenizer.is_request_active(request_id): - # raise ValueError(f"Request {request_id} already exists.") - # 1) Add to RequestState tracker. The "event" is used to manage # concurrency between generate() and output_handler(). self.rid_to_state[request_id] = RequestState.new() @@ -197,6 +194,8 @@ async def add_request( trace_headers, prompt_adapter_request, priority) # 3) Add the DetokenizerRequest to Detokenizer. + # TODO: sending these separately is a race condition. We should instead + # have the EngineCore do the "AddRequest" logic. await self.detokenizer.add_request_async(detokenizer_req) # 4) Add the EngineCoreRequest to EngineCore. @@ -268,6 +267,8 @@ async def generate( # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. out = state.out_list[-1] + if len(state.out_list) > 10: + logger.info(f"{len(state.out_list)=}") except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. 
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 30422891413c6..395efaf3bc017 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -33,7 +33,7 @@ POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -LOGGING_TIME_S = 5 +LOGGING_TIME_S = 1 class EngineCore: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index cfd3edab13877..01798dbee493e 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -130,8 +130,8 @@ class MPClient(EngineCoreClient): def __init__( self, *args, - output_path: str, asyncio_mode: bool, + output_path: Optional[str] = None, **kwargs, ): # Serialization setup. @@ -140,9 +140,9 @@ def __init__( # ZMQ setup. if asyncio_mode: - self.ctx = zmq.asyncio.Context() + self.ctx = zmq.asyncio.Context(io_threads=2) else: - self.ctx = zmq.Context() # type: ignore[attr-defined] + self.ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] input_path = get_open_zmq_ipc_path() self.input_socket = make_zmq_socket( @@ -151,6 +151,9 @@ def __init__( zmq.constants.PUSH, ) + if output_path is None: + output_path = get_open_zmq_ipc_path() + # Start EngineCore in background process. self.proc_handle: Optional[BackgroundProcHandle] self.proc_handle = EngineCoreProc.make_engine_core_process( diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index a1c384cd45967..67e3490618a32 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -13,7 +13,7 @@ from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.utils import get_open_zmq_ipc_path +from vllm.utils import get_open_zmq_ipc_path, kill_process_tree from vllm.v1.engine import (DetokenizerRequest, DetokenizerRequestType, EngineCoreOutput, EngineCoreOutputs, BackgroundProcHandle,) @@ -386,6 +386,10 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the Detokenizer.""" + log_interval = 0 + import time + + last_log = time.perf_counter() try: # TODO: handle aborted due to client cancellation # TODO: pickle -> msgpack @@ -420,7 +424,11 @@ def run_busy_loop(self): engine_core_outputs = decoder_out.decode(frame.buffer).outputs outputs = self.step(engine_core_outputs) msg = pickle.dumps(outputs, protocol=pickle.HIGHEST_PROTOCOL) - output_socket.send_multipart((msg, ), copy=False) + # now = time.perf_counter() + # if now - last_log > 0.1: + # logger.info("Detok: Sending") + # last_log = now + output_socket.send_multipart((msg, ), copy=False) except Exception as e: logger.error(e) @@ -435,7 +443,7 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): self.decoder = PickleEncoder() # ZMQ setup. - self.ctx = zmq.asyncio.Context() + self.ctx = zmq.asyncio.Context(io_threads=2) # Get input (DetokenizerRequest) to Detokenizer. 
input_path = get_open_zmq_ipc_path() @@ -463,6 +471,13 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): **kwargs, ) + def shutdown(self): + self.proc_handle.proc.terminate() + self.proc_handle.proc.join(5) + + if self.proc_handle.proc.is_alive(): + kill_process_tree(self.proc_handle.proc.pid) + async def add_request_async(self, request: DetokenizerRequest): """Send new DetokenizerRequest to Detokenizer.""" diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index ecd98f246b064..7d748f6cfee6d 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -88,11 +88,25 @@ def make_zmq_socket( ) -> Union[zmq.Socket, zmq.asyncio.Socket]: """Make a ZMQ socket with the proper bind/connext semantics.""" + import psutil + mem = psutil.virtual_memory() + socket = ctx.socket(type) + + total_mem = mem.total / 1024**3 + available_mem = mem.available / 1024**3 + if total_mem > 32 and available_mem > 16: + buf_size = int(0.5 * 1024**3) + else: + buf_size = -1 if type == zmq.constants.PULL: + socket.setsockopt(zmq.RCVHWM, 0) + socket.setsockopt(zmq.RCVBUF, buf_size) socket.connect(path) elif type == zmq.constants.PUSH: + socket.setsockopt(zmq.SNDHWM, 0) + socket.setsockopt(zmq.SNDBUF, buf_size) socket.bind(path) else: raise ValueError(f"Unknown Socket Type: {type}") @@ -105,7 +119,7 @@ def zmq_socket_ctx( type: Any) -> Iterator[zmq.Socket]: # type: ignore[name-defined] """Context manager for a ZMQ socket""" - ctx = zmq.Context() # type: ignore[attr-defined] + ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] try: yield make_zmq_socket(ctx, path, type) From 73da178500642e25a503d5c75fa7c7913407dbe5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 17 Dec 2024 22:14:46 +0000 Subject: [PATCH 027/132] stash --- benchmarks/benchmark_throughput.py | 21 ++++++++++++------- vllm/v1/engine/async_llm.py | 33 ++++++++++++++++++++++++------ vllm/v1/engine/core.py | 3 +++ vllm/v1/engine/detokenizer.py | 15 +++++++++++--- 4 files changed, 56 insertions(+), 16 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 1e5967bd9bf8b..0926cec29a907 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -170,7 +170,7 @@ def run_vllm( end = time.perf_counter() return end - start - +import asyncio async def run_vllm_async( requests: List[SampleRequest], n: int, @@ -198,17 +198,23 @@ async def run_vllm_async( max_tokens=request.expected_output_len, )) - generators = [] + tasks = [] start = time.perf_counter() for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): generator = llm.generate(prompt, sp, request_id=f"test{i}") - generators.append(generator) - all_gens = merge_async_iterators(*generators) - async for i, res in all_gens: - pass + tasks.append(run(generator)) + # all_gens = merge_async_iterators(*generators) + # async for i, res in all_gens: + # pass + + await asyncio.gather(*tasks) + end = time.perf_counter() return end - start +async def run(generator): + async for res in generator: + pass def run_hf( requests: List[SampleRequest], @@ -331,7 +337,8 @@ def main(args: argparse.Namespace): for request in requests) if args.backend == "vllm": if args.async_engine: - elapsed_time = uvloop.run( + asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + elapsed_time = asyncio.run( run_vllm_async( requests, args.n, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 5f2f596efd1d9..def445c155f83 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py 
@@ -110,7 +110,8 @@ def __init__( usage_context=usage_context, ) - self.output_handler: Optional[asyncio.Task] = None + # self.output_handler: Optional[asyncio.Task] = None + self.to_create_loop = True def __del__(self): self.shutdown() @@ -239,9 +240,15 @@ async def generate( # We start the output_handler on the first call to generate() so that # we can call __init__ before the event loop starts, which enables us # to handle startup failure gracefully in the OpenAI server. - if self.output_handler is None: - self.output_handler = asyncio.create_task( - self._run_output_handler()) + # if self.output_handler is None: + if self.to_create_loop: + loop = asyncio.get_event_loop() + print(f"{loop=}") + loop.create_task(self._run_output_handler()) + self.to_create_loop = False + # self.output_handler = asyncio.create_task( + # self._run_output_handler()) + state = await self.add_request( request_id, @@ -257,6 +264,7 @@ async def generate( try: await asyncio.wait_for(state.event.wait(), timeout=WAITING_TIMEOUT_MS) + logger.info(f"{request_id} woke up.") # NOTE(rob): out_list can have more than one item. However, # in the streaming case, we use RequestOutputKind.CUMULATIVE, @@ -272,7 +280,7 @@ async def generate( except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. - logger.debug("Timeout waiting for %s", request_id) + logger.info("Timeout waiting for %s", request_id) continue state.out_list = [] @@ -330,23 +338,36 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" + # idx = 0 + # import pyinstrument + + # prof = pyinstrument.Profiler() + # prof.start() + # i = 0 try: while True: # 1) Pull outputs from the Detokenizer. request_outputs, reqs_to_abort = ( await self.detokenizer.get_output_async()) + logger.info("AsyncLLM") + # logger.info(f"RECV: {idx}") + # idx+=1 # 2) Put the RequestOutputs into the per-request AsyncStreams. self._process_request_outputs(request_outputs) # 3) Abort any requests that finished due to stop strings. - await self.engine_core.abort_requests_async(reqs_to_abort) + # await self.engine_core.abort_requests_async(reqs_to_abort) # 4) Abort any requests due to client cancellations. # TODO: send back to detokenizer if this fails. 
# await self._process_cancellations() + # except KeyboardInterrupt: + # prof.stop() + # prof.write_html("output_handler.prof") + except Exception as e: logger.error(e) raise e diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 395efaf3bc017..e36af368568e0 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -334,10 +334,13 @@ def process_output_socket(self, output_path: str): buffer = bytearray() with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: + idx = 0 while True: engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) encoder.encode_into(outputs, buffer) # msg = (DetokenizerRequestType.OUT.value, buffer) msg = (buffer, ) + # logger.info(f"SEND: {idx}: {len(engine_core_outputs)}") + # idx += 1 socket.send_multipart(msg, copy=False) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 67e3490618a32..f989c12b89552 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -409,6 +409,7 @@ def run_busy_loop(self): poller.register(engine_core_outputs_socket, zmq.POLLIN) poller.register(input_socket, zmq.POLLIN) + # idx = 0 while True: socks = dict(poller.poll()) @@ -428,12 +429,16 @@ def run_busy_loop(self): # if now - last_log > 0.1: # logger.info("Detok: Sending") # last_log = now + # logger.info(f"SEND: {idx}: {len(engine_core_outputs)}") + # idx += 1 output_socket.send_multipart((msg, ), copy=False) except Exception as e: logger.error(e) raise e - + +import time + class DetokenizerClient: def __init__(self, *args, engine_core_outputs_path: str, **kwargs): @@ -484,9 +489,13 @@ async def add_request_async(self, request: DetokenizerRequest): msg = (self.encoder.encode(request), ) await self.input_socket.send_multipart(msg, copy=False) - async def get_output_async(self) -> Tuple[List[RequestOutput], List[str]]: """Get RequestOutputs, RequestsToAbort from Detokenizer.""" (frame, ) = await self.output_socket.recv_multipart(copy=False) - return self.decoder.decode(frame.buffer) + start = time.perf_counter() + out = self.decoder.decode(frame.buffer) + end = time.perf_counter() + if end - start > 0.1: + logger.info(f"{end - start}") + return out From 9830fbe1a7286744b040ee8d49359075c0eab317 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 18 Dec 2024 15:04:36 +0000 Subject: [PATCH 028/132] stash --- vllm/v1/engine/__init__.py | 24 +++++++++++++ vllm/v1/engine/async_llm.py | 37 +++++++------------- vllm/v1/engine/detokenizer.py | 65 ++++++++++++++++++----------------- 3 files changed, 70 insertions(+), 56 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index ee6b90b1bab1f..089ad1052e5f2 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -35,6 +35,30 @@ class DetokenizerRequest( include_stop_str_in_output: bool +class DetokenizerOutput( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True, # type: ignore[call-arg] + gc=False): # type: ignore[call-arg] + + request_id: str + text: str + finished: bool + + +class DetokenizerOutputs( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True, # type: ignore[call-arg] + gc=False): # type: ignore[call-arg] + + #NOTE(Nick): We could consider ways to make this more compact, + # e.g. 
columnwise layout and using an int enum for finish/stop reason + + # [num_reqs] + outputs: List[DetokenizerOutput] + + @dataclass class EngineCoreRequest: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index def445c155f83..8b18f34b5d85c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -243,12 +243,8 @@ async def generate( # if self.output_handler is None: if self.to_create_loop: loop = asyncio.get_event_loop() - print(f"{loop=}") loop.create_task(self._run_output_handler()) self.to_create_loop = False - # self.output_handler = asyncio.create_task( - # self._run_output_handler()) - state = await self.add_request( request_id, @@ -264,7 +260,7 @@ async def generate( try: await asyncio.wait_for(state.event.wait(), timeout=WAITING_TIMEOUT_MS) - logger.info(f"{request_id} woke up.") + # logger.info(f"{request_id} woke up.") # NOTE(rob): out_list can have more than one item. However, # in the streaming case, we use RequestOutputKind.CUMULATIVE, @@ -322,20 +318,6 @@ async def _process_cancellations(self) -> None: # Remove from EngineCore. await self.engine_core.abort_requests_async(reqs_to_abort) - def _process_request_outputs(self, request_outputs: List[RequestOutput]): - """Process outputs by putting them into per-request AsyncStreams.""" - - for request_output in request_outputs: - if request_output.request_id not in self.rid_to_state: - raise RuntimeError(f"{request_output.request_id} " - "not in RequestStates") - - # Update the RequestState and alert generate() that there - # is a RequestOutput ready to return to the user. - state = self.rid_to_state[request_output.request_id] - state.out_list.append(request_output) - state.event.set() - async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" # idx = 0 @@ -348,14 +330,21 @@ async def _run_output_handler(self): try: while True: # 1) Pull outputs from the Detokenizer. - request_outputs, reqs_to_abort = ( - await self.detokenizer.get_output_async()) - logger.info("AsyncLLM") + detokenizer_outputs = ( + await self.detokenizer.get_output_async()).outputs # logger.info(f"RECV: {idx}") # idx+=1 - # 2) Put the RequestOutputs into the per-request AsyncStreams. - self._process_request_outputs(request_outputs) + for out in detokenizer_outputs: + if out.request_id not in self.rid_to_state: + raise RuntimeError(f"{out.request_id} " + "not in RequestStates") + + # Update the RequestState and alert generate() that there + # is a RequestOutput ready to return to the user. + state = self.rid_to_state[out.request_id] + state.out_list.append(out) + state.event.set() # 3) Abort any requests that finished due to stop strings. 
# await self.engine_core.abort_requests_async(reqs_to_abort) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index f989c12b89552..76fa3a595f72b 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -14,7 +14,8 @@ AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import get_open_zmq_ipc_path, kill_process_tree -from vllm.v1.engine import (DetokenizerRequest, DetokenizerRequestType, +from vllm.v1.engine import (DetokenizerRequest, DetokenizerOutputs, + DetokenizerOutput, EngineCoreOutput, EngineCoreOutputs, BackgroundProcHandle,) from vllm.v1.utils import (make_zmq_socket, zmq_socket_ctx, @@ -251,10 +252,11 @@ def add_request( def step( self, encore_core_outputs: List[EngineCoreOutput] - ) -> Tuple[List[RequestOutput], List[str]]: + ) -> DetokenizerOutputs: """Update state and request the RequestOutputs to the LLMEngine.""" - request_outputs: List[RequestOutput] = [] + # request_outputs: List[RequestOutput] = [] + detokenizer_outputs = DetokenizerOutputs(outputs=[]) requests_to_abort: List[str] = [] for engine_core_output in encore_core_outputs: request_id = engine_core_output.request_id @@ -269,22 +271,30 @@ def step( finish_reason=engine_core_output.finish_reason, stop_reason=engine_core_output.stop_reason, ) - + if request_output is not None: - # Add to RequestOutputs list. - request_outputs.append(request_output) - - # Free completed requests. - if request_output.finished: - self.request_states.pop(request_id) - # If Request finished but EngineCore not finished, - # this was caused by a stop string + we need to send - # an abort signal to the EngineCore. - if not engine_core_output.finished: - requests_to_abort.append(request_id) + detokenizer_outputs.outputs.append( + DetokenizerOutput( + request_id=request_id, + text=request_output.outputs[0].text, + finished=request_output.finished, + ) + ) + # # Add to RequestOutputs list. + # request_outputs.append(request_output) + + # # Free completed requests. + # if request_output.finished: + # self.request_states.pop(request_id) + # # If Request finished but EngineCore not finished, + # # this was caused by a stop string + we need to send + # # an abort signal to the EngineCore. + # if not engine_core_output.finished: + # requests_to_abort.append(request_id) # Return to EngineClient. 
- return request_outputs, requests_to_abort + # return request_outputs, requests_to_abort + return detokenizer_outputs, [] class DetokenizerProc(Detokenizer): """ZMQ-wrapper for running Detokenizer in background process.""" @@ -397,6 +407,7 @@ def run_busy_loop(self): decoder_new = msgspec.msgpack.Decoder(DetokenizerRequest) decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) + encoder = msgspec.msgpack.Encoder() with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.constants.PULL) as engine_core_outputs_socket, zmq_socket_ctx(self.input_path, zmq.constants.PULL) as input_socket, @@ -423,14 +434,8 @@ def run_busy_loop(self): if engine_core_outputs_socket in socks: (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs - outputs = self.step(engine_core_outputs) - msg = pickle.dumps(outputs, protocol=pickle.HIGHEST_PROTOCOL) - # now = time.perf_counter() - # if now - last_log > 0.1: - # logger.info("Detok: Sending") - # last_log = now - # logger.info(f"SEND: {idx}: {len(engine_core_outputs)}") - # idx += 1 + detokenizer_outputs, _ = self.step(engine_core_outputs) + msg = encoder.encode(detokenizer_outputs) output_socket.send_multipart((msg, ), copy=False) except Exception as e: @@ -445,7 +450,8 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): # Serialization setup. self.encoder = msgspec.msgpack.Encoder() - self.decoder = PickleEncoder() + # self.decoder = PickleEncoder() + self.decoder = msgspec.msgpack.Decoder(DetokenizerOutputs) # ZMQ setup. self.ctx = zmq.asyncio.Context(io_threads=2) @@ -489,13 +495,8 @@ async def add_request_async(self, request: DetokenizerRequest): msg = (self.encoder.encode(request), ) await self.input_socket.send_multipart(msg, copy=False) - async def get_output_async(self) -> Tuple[List[RequestOutput], List[str]]: + async def get_output_async(self) -> DetokenizerOutputs: """Get RequestOutputs, RequestsToAbort from Detokenizer.""" (frame, ) = await self.output_socket.recv_multipart(copy=False) - start = time.perf_counter() - out = self.decoder.decode(frame.buffer) - end = time.perf_counter() - if end - start > 0.1: - logger.info(f"{end - start}") - return out + return self.decoder.decode(frame.buffer) From 661ee446710cdfcac6c713c2f08efeb500f4af23 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 18 Dec 2024 15:09:33 +0000 Subject: [PATCH 029/132] stash --- vllm/v1/engine/async_llm.py | 12 ------------ vllm/v1/engine/core.py | 6 +++--- vllm/v1/engine/core_client.py | 2 +- vllm/v1/engine/detokenizer.py | 12 ++++++------ vllm/v1/executor/multiproc_executor.py | 4 ++-- vllm/v1/utils.py | 6 +++--- 6 files changed, 15 insertions(+), 27 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 8b18f34b5d85c..403562fc4f43c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -320,20 +320,12 @@ async def _process_cancellations(self) -> None: async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" - # idx = 0 - # import pyinstrument - - # prof = pyinstrument.Profiler() - # prof.start() - # i = 0 try: while True: # 1) Pull outputs from the Detokenizer. detokenizer_outputs = ( await self.detokenizer.get_output_async()).outputs - # logger.info(f"RECV: {idx}") - # idx+=1 for out in detokenizer_outputs: if out.request_id not in self.rid_to_state: @@ -353,10 +345,6 @@ async def _run_output_handler(self): # TODO: send back to detokenizer if this fails. 
# await self._process_cancellations() - # except KeyboardInterrupt: - # prof.stop() - # prof.write_html("output_handler.prof") - except Exception as e: logger.error(e) raise e diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e36af368568e0..2eff8a8adb230 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -166,7 +166,7 @@ def __init__( daemon=True).start() # Send Readiness signal to EngineClient. - with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: + with zmq_socket_ctx(ready_path, zmq.PUSH) as ready_socket: ready_socket.send_string(EngineCoreProc.READY_STR) @staticmethod @@ -305,7 +305,7 @@ def process_input_socket(self, input_path: str): decoder_add_req = PickleEncoder() decoder_abort_req = PickleEncoder() - with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket: + with zmq_socket_ctx(input_path, zmq.PULL) as socket: while True: # (RequestType, RequestData) type_frame, data_frame = socket.recv_multipart(copy=False) @@ -333,7 +333,7 @@ def process_output_socket(self, output_path: str): # Reuse send buffer. buffer = bytearray() - with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: + with zmq_socket_ctx(output_path, zmq.PUSH) as socket: idx = 0 while True: engine_core_outputs = self.output_queue.get() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 01798dbee493e..4c0745060bc02 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -148,7 +148,7 @@ def __init__( self.input_socket = make_zmq_socket( self.ctx, input_path, - zmq.constants.PUSH, + zmq.PUSH, ) if output_path is None: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 76fa3a595f72b..43930cbfb8eab 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -317,7 +317,7 @@ def __init__( self.output_path = output_path # Send readiness signal. - with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: + with zmq_socket_ctx(ready_path, zmq.PUSH) as ready_socket: ready_socket.send_string(DetokenizerProc.READY_STR) @@ -409,9 +409,9 @@ def run_busy_loop(self): decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) encoder = msgspec.msgpack.Encoder() - with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.constants.PULL) as engine_core_outputs_socket, - zmq_socket_ctx(self.input_path, zmq.constants.PULL) as input_socket, - zmq_socket_ctx(self.output_path, zmq.constants.PUSH) as output_socket): + with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as engine_core_outputs_socket, + zmq_socket_ctx(self.input_path, zmq.PULL) as input_socket, + zmq_socket_ctx(self.output_path, zmq.PUSH) as output_socket): # TODO: avoid poll by having both EngineCore # and AsyncLLM send to the same socket (unclear why this @@ -461,7 +461,7 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): self.input_socket = make_zmq_socket( self.ctx, input_path, - zmq.constants.PUSH, + zmq.PUSH, ) # Get output (RequestOutput) from Detokenizer. @@ -469,7 +469,7 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): self.output_socket = make_zmq_socket( self.ctx, output_path, - zmq.constants.PULL, + zmq.PULL, ) # Start Detokenizer in background process. 
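The hunks above only swap the long-form zmq.constants.* names for the equivalent zmq.PULL / zmq.PUSH aliases; the socket wiring itself is unchanged, and the executor and utils hunks that follow make the same rename. For orientation, a minimal standalone sketch of the PUSH/PULL-over-IPC pattern that make_zmq_socket() sets up between the client and the background process could look like the following. The IPC path and variable names here are placeholders, not the ones vLLM generates with get_open_zmq_ipc_path():

import zmq

path = "ipc:///tmp/zmq_pushpull_demo"   # placeholder path for illustration only
ctx = zmq.Context(io_threads=2)

push = ctx.socket(zmq.PUSH)             # producer side binds, as in make_zmq_socket()
push.setsockopt(zmq.SNDHWM, 0)          # no send high-water mark
push.bind(path)

pull = ctx.socket(zmq.PULL)             # consumer side connects
pull.setsockopt(zmq.RCVHWM, 0)          # no receive high-water mark
pull.connect(path)

push.send(b"hello")
assert pull.recv() == b"hello"
ctx.destroy()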
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index aa246f778f8f3..12fe559341931 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -254,7 +254,7 @@ def __init__( worker_response_mq_handle = self.worker_response_mq.export_handle() # Send Readiness signal to EngineCore process. - with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: + with zmq_socket_ctx(ready_path, zmq.PUSH) as ready_socket: payload = pickle.dumps(worker_response_mq_handle, protocol=pickle.HIGHEST_PROTOCOL) ready_socket.send_string(WorkerProc.READY_STR) @@ -356,7 +356,7 @@ def wait_for_startup( ready_path: str, ) -> Optional[Handle]: """Wait until the Worker is ready.""" - with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: + with zmq_socket_ctx(ready_path, zmq.PULL) as socket: # Wait for Worker to send READY. while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 7d748f6cfee6d..d438985736ba5 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -100,11 +100,11 @@ def make_zmq_socket( else: buf_size = -1 - if type == zmq.constants.PULL: + if type == zmq.PULL: socket.setsockopt(zmq.RCVHWM, 0) socket.setsockopt(zmq.RCVBUF, buf_size) socket.connect(path) - elif type == zmq.constants.PUSH: + elif type == zmq.PUSH: socket.setsockopt(zmq.SNDHWM, 0) socket.setsockopt(zmq.SNDBUF, buf_size) socket.bind(path) @@ -138,7 +138,7 @@ def wait_for_startup( ) -> None: """Wait until a background process is ready.""" - with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: + with zmq_socket_ctx(ready_path, zmq.PULL) as socket: try: while socket.poll(timeout=timeout_ms) == 0: logger.debug("Waiting for background proc to startup.") From 6f1252547189b78e7c700ea66345642fcf1ac6b3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 18 Dec 2024 15:13:25 +0000 Subject: [PATCH 030/132] stash --- vllm/v1/engine/async_llm.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 403562fc4f43c..5c4c7f49de4de 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -25,9 +25,6 @@ logger = init_logger(__name__) -WAITING_TIMEOUT_MS = 5 - - @dataclass class RequestState: """ @@ -259,7 +256,7 @@ async def generate( while True: try: await asyncio.wait_for(state.event.wait(), - timeout=WAITING_TIMEOUT_MS) + timeout=4) # logger.info(f"{request_id} woke up.") # NOTE(rob): out_list can have more than one item. However, @@ -271,12 +268,12 @@ async def generate( # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. out = state.out_list[-1] - if len(state.out_list) > 10: + if len(state.out_list) > 1: logger.info(f"{len(state.out_list)=}") except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. 
- logger.info("Timeout waiting for %s", request_id) + logger.debug("Timeout waiting for %s", request_id) continue state.out_list = [] From fd91f4b085ef1fa89ed8505d824c4fc042983189 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 18 Dec 2024 16:14:22 +0000 Subject: [PATCH 031/132] stash --- vllm/v1/core/scheduler.py | 2 ++ vllm/v1/engine/__init__.py | 3 +++ vllm/v1/engine/async_llm.py | 8 +++--- vllm/v1/engine/detokenizer.py | 49 ++++++++++++++++++++++++++++++++--- 4 files changed, 55 insertions(+), 7 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index b44d72afae94a..23dc4ef298fea 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -428,6 +428,8 @@ def update_from_output( # Add EngineCoreOutput for this Request. output = EngineCoreOutput( request_id=req_id, + prompt=request.prompt, + prompt_token_ids=request.prompt_token_ids, new_token_ids=request.output_token_ids[-num_new_tokens:], finished=request.is_finished(), finish_reason=request.get_finished_reason(), diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 089ad1052e5f2..0de28b0b76f0c 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -87,10 +87,13 @@ class EngineCoreOutput( gc=False): # type: ignore[call-arg] request_id: str + prompt_token_ids: List[int] + prompt: str new_token_ids: List[int] finished: bool finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None + class EngineCoreOutputs( diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 5c4c7f49de4de..20eca6ac75a8b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -187,14 +187,14 @@ async def add_request( self.rid_to_state[request_id] = RequestState.new() # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. - detokenizer_req, engine_core_req = self.processor.process_inputs( + _, engine_core_req = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) # 3) Add the DetokenizerRequest to Detokenizer. # TODO: sending these separately is a race condition. We should instead # have the EngineCore do the "AddRequest" logic. - await self.detokenizer.add_request_async(detokenizer_req) + # await self.detokenizer.add_request_async(detokenizer_req) # 4) Add the EngineCoreRequest to EngineCore. await self.engine_core.add_request_async(engine_core_req) @@ -268,8 +268,8 @@ async def generate( # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. out = state.out_list[-1] - if len(state.out_list) > 1: - logger.info(f"{len(state.out_list)=}") + # if len(state.out_list) > 1: + # logger.info(f"{len(state.out_list)=}") except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. 
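The scheduler.py and __init__.py hunks in this commit start carrying prompt and prompt_token_ids on every EngineCoreOutput, so the detokenizer.py diff that follows can rebuild per-request state from the outputs alone. As a rough, self-contained illustration of the msgspec pattern these messages use (the class below mirrors the field layout in the diff but is not the vLLM class), array_like structs encode positionally into msgpack arrays, so the new prompt fields travel with every output sent over the inter-process socket:

from typing import List, Optional

import msgspec


class Output(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
    # Field order matters: array_like structs are encoded positionally.
    request_id: str
    prompt_token_ids: List[int]
    prompt: str
    new_token_ids: List[int]
    finished: bool
    finish_reason: Optional[str] = None


encoder = msgspec.msgpack.Encoder()
decoder = msgspec.msgpack.Decoder(Output)

wire = encoder.encode(Output("req-0", [1, 2, 3], "hello", [4], False))
assert decoder.decode(wire).prompt_token_ids == [1, 2, 3]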
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 43930cbfb8eab..e9a5a00d696b1 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -106,6 +106,37 @@ def from_new_request( stop_buffer_length=stop_buffer_length, ) + @classmethod + def from_eco( + cls, + tokenizer: AnyTokenizer, + eco: EngineCoreOutput, + ): + tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( + tokenizer=tokenizer, + prompt_ids=eco.prompt_token_ids, + skip_special_tokens=True, + ) + + return cls( + output_text="", + tokens=tokens, + token_ids=eco.prompt_token_ids, + stop=[], + include_stop_str_in_output=False, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=True, + spaces_between_special_tokens=True, + output_kind=RequestOutputKind.CUMULATIVE, + request_id=eco.request_id, + prompt=eco.prompt, + prompt_token_ids=eco.prompt_token_ids, + tokenizer=tokenizer, + stop_buffer_length=0, + ) + + def add_tokens( self, new_token_ids: List[int], @@ -158,8 +189,6 @@ def add_tokens( finish_reason = "stop" # TODO: use constant stop_reason = stop_str - # TODO: handle stop_token_ids here too? - # 3) Update the RequestOutput object with the new text. finished = bool(finish_reason) if self.output_kind == RequestOutputKind.FINAL_ONLY \ @@ -250,16 +279,30 @@ def add_request( self.tokenizer, request) self.request_states[request.request_id] = request_state + def add_request_eco( + self, + eco: EngineCoreOutput, + ): + request_state = IncrementalDetokenizer.from_eco( + self.tokenizer, eco) + self.request_states[eco.request_id] = request_state + + def step( self, encore_core_outputs: List[EngineCoreOutput] ) -> DetokenizerOutputs: """Update state and request the RequestOutputs to the LLMEngine.""" # request_outputs: List[RequestOutput] = [] + # requests_to_abort: List[str] = [] detokenizer_outputs = DetokenizerOutputs(outputs=[]) - requests_to_abort: List[str] = [] + for engine_core_output in encore_core_outputs: request_id = engine_core_output.request_id + + if request_id not in self.request_states: + self.add_request_eco(engine_core_output) + detokenizer = self.request_states.get(request_id) if detokenizer is None: # Ignore output for already-aborted request. 
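Taken together, from_eco() and add_request_eco() above point toward dropping the separate add-request hop into the Detokenizer: once every output carries the prompt, the background process can create state the first time it sees a request_id, which is why the async_llm.py hunk earlier in this commit comments out the detokenizer.add_request_async() call. A simplified sketch of that lazy-registration idea, with illustrative names and shapes rather than the real vLLM interfaces:

from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class ReqState:
    prompt_token_ids: List[int]
    output_token_ids: List[int] = field(default_factory=list)


class LazyDetokenizer:
    def __init__(self) -> None:
        self.request_states: Dict[str, ReqState] = {}

    def step(self, outputs: List[dict]) -> None:
        for out in outputs:
            # The first output for a request carries its prompt, so state is
            # created here instead of through a separate add_request() call
            # that could race with the first output.
            state = self.request_states.get(out["request_id"])
            if state is None:
                state = ReqState(list(out["prompt_token_ids"]))
                self.request_states[out["request_id"]] = state
            state.output_token_ids.extend(out["new_token_ids"])
            if out["finished"]:
                self.request_states.pop(out["request_id"], None)

The trade-off, visible in the __init__.py hunk above, is that prompt data now rides on the output messages rather than being sent once at request registration.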
From 6c99a4f33e2adc9d3f07fafaff2de7c8a10d93ce Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 18 Dec 2024 17:10:04 +0000 Subject: [PATCH 032/132] stash --- vllm/outputs.py | 25 +++++++++ vllm/v1/engine/__init__.py | 1 + vllm/v1/engine/async_llm.py | 95 +++++++++++++++++++++-------------- vllm/v1/engine/core_client.py | 2 +- vllm/v1/engine/detokenizer.py | 1 + 5 files changed, 84 insertions(+), 40 deletions(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index 2ecdf74ee59b3..9a4b4353deb1d 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -12,6 +12,7 @@ from vllm.sampling_params import RequestOutputKind from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs, SequenceGroup, SequenceGroupBase, SequenceStatus) +from vllm.v1.engine import DetokenizerOutput @dataclass @@ -132,6 +133,30 @@ def __init__( self.encoder_prompt_token_ids = encoder_prompt_token_ids self.num_cached_tokens = num_cached_tokens + @classmethod + def from_detok( + cls, + prompt: str, + prompt_token_ids: List[int], + detok_output: DetokenizerOutput, + ): + completion_output = CompletionOutput( + index=0, + text=detok_output.text, + token_ids=detok_output.token_ids, + cumulative_logprob=None, + logprobs=None, # TODO + ) + + return RequestOutput( + request_id=detok_output.request_id, + prompt=prompt, + prompt_token_ids=prompt_token_ids, + prompt_logprobs=None, # TODO + outputs=[completion_output], + finished=detok_output.finished, + ) + @classmethod def new( cls, diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 0de28b0b76f0c..860662602160a 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -43,6 +43,7 @@ class DetokenizerOutput( request_id: str text: str + token_ids: List[int] finished: bool diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 20eca6ac75a8b..8148d28de1bf1 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -25,6 +25,9 @@ logger = init_logger(__name__) +import uvloop +asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + @dataclass class RequestState: """ @@ -39,12 +42,14 @@ class RequestState: out_list back to the caller generate() """ + prompt: str + prompt_token_ids: List[int] event: asyncio.Event out_list: List[RequestOutput] @classmethod - def new(cls) -> "RequestState": - return cls(asyncio.Event(), []) + def new(cls, prompt, prompt_token_ids) -> "RequestState": + return cls(prompt, prompt_token_ids, asyncio.Event(), []) class AsyncLLM(EngineClient): @@ -63,6 +68,7 @@ def __init__( ) -> None: assert start_engine_loop + self.warned = False self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers = stat_loggers @@ -182,14 +188,18 @@ async def add_request( ) -> RequestState: """Add new request to the AsyncLLM.""" - # 1) Add to RequestState tracker. The "event" is used to manage - # concurrency between generate() and output_handler(). - self.rid_to_state[request_id] = RequestState.new() + + # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. _, engine_core_req = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) + + # 1) Add to RequestState tracker. The "event" is used to manage + # concurrency between generate() and output_handler(). + self.rid_to_state[request_id] = RequestState.new(prompt, + engine_core_req.prompt_token_ids) # 3) Add the DetokenizerRequest to Detokenizer. # TODO: sending these separately is a race condition. 
We should instead @@ -231,8 +241,8 @@ async def generate( """ # DELTA streaming is not supported due to dynamic chunking. - assert (sampling_params.output_kind == RequestOutputKind.CUMULATIVE - or sampling_params.output_kind == RequestOutputKind.FINAL_ONLY) + assert (sampling_params.output_kind == RequestOutputKind.CUMULATIVE or + sampling_params.output_kind == RequestOutputKind.FINAL_ONLY) # We start the output_handler on the first call to generate() so that # we can call __init__ before the event loop starts, which enables us @@ -255,9 +265,7 @@ async def generate( while True: try: - await asyncio.wait_for(state.event.wait(), - timeout=4) - # logger.info(f"{request_id} woke up.") + await asyncio.wait_for(state.event.wait(), timeout=4) # NOTE(rob): out_list can have more than one item. However, # in the streaming case, we use RequestOutputKind.CUMULATIVE, @@ -268,8 +276,9 @@ async def generate( # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. out = state.out_list[-1] - # if len(state.out_list) > 1: - # logger.info(f"{len(state.out_list)=}") + if len(state.out_list) > 2 and not self.warned: + logger.info(f"{len(state.out_list)=}") + self.warned = True except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. @@ -285,35 +294,35 @@ async def generate( state.event.clear() yield out - async def _process_cancellations(self) -> None: - """ - Process requests cancelled from user disconnecting. + # async def _process_cancellations(self) -> None: + # """ + # Process requests cancelled from user disconnecting. - When a client disconnects, AsyncStream._cancel() is called. - We passed a callback to AsyncStream(), which appends to - self.client_aborted_requests. + # When a client disconnects, AsyncStream._cancel() is called. + # We passed a callback to AsyncStream(), which appends to + # self.client_aborted_requests. - As a result, if any requests are canceled from the user side - the request_id will show up in self.client_aborted_requests. - """ + # As a result, if any requests are canceled from the user side + # the request_id will show up in self.client_aborted_requests. + # """ - # Avoid streams having circular ref to parent AsyncLLM object. - if not self.client_aborted_requests: - return - reqs_to_abort = self.client_aborted_requests.copy() - self.client_aborted_requests.clear() + # # Avoid streams having circular ref to parent AsyncLLM object. + # if not self.client_aborted_requests: + # return + # reqs_to_abort = self.client_aborted_requests.copy() + # self.client_aborted_requests.clear() - # Remove from Detokenizer. - self.detokenizer.abort_requests(reqs_to_abort) + # # Remove from Detokenizer. + # self.detokenizer.abort_requests(reqs_to_abort) - # Remove from RequestStreams. - for request_id in reqs_to_abort: - if self.log_requests: - logger.info("User-cancelled request %s.", request_id) - self._finish_stream(request_id) + # # Remove from RequestStreams. + # for request_id in reqs_to_abort: + # if self.log_requests: + # logger.info("User-cancelled request %s.", request_id) + # self._finish_stream(request_id) - # Remove from EngineCore. - await self.engine_core.abort_requests_async(reqs_to_abort) + # # Remove from EngineCore. 
+ # await self.engine_core.abort_requests_async(reqs_to_abort) async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" @@ -324,14 +333,22 @@ async def _run_output_handler(self): detokenizer_outputs = ( await self.detokenizer.get_output_async()).outputs - for out in detokenizer_outputs: - if out.request_id not in self.rid_to_state: - raise RuntimeError(f"{out.request_id} " + for detok_out in detokenizer_outputs: + if detok_out.request_id not in self.rid_to_state: + raise RuntimeError(f"{detok_out.request_id} " "not in RequestStates") + state = self.rid_to_state[detok_out.request_id] + + out = RequestOutput.from_detok( + state.prompt, + state.prompt_token_ids, + detok_out, + ) + # Update the RequestState and alert generate() that there # is a RequestOutput ready to return to the user. - state = self.rid_to_state[out.request_id] + state.out_list.append(out) state.event.set() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4c0745060bc02..4327e48b86ef1 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -232,7 +232,7 @@ async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: msg = (request_type.value, self.encoder.encode(request)) - await self.input_socket.send_multipart(msg, copy=False) + await self.input_socket.send_multipart(msg, copy=False, flag=zmq.NOBLOCK) async def add_request_async(self, request: EngineCoreRequest) -> None: await self._send_input(EngineCoreRequestType.ADD, request) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index e9a5a00d696b1..d56f9646fdda3 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -319,6 +319,7 @@ def step( detokenizer_outputs.outputs.append( DetokenizerOutput( request_id=request_id, + token_ids=request_output.outputs[0].token_ids, text=request_output.outputs[0].text, finished=request_output.finished, ) From dfa452658111a0341d8b224923b1cc8b6d811016 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 19 Dec 2024 00:09:53 +0000 Subject: [PATCH 033/132] stahs --- vllm/v1/engine/async_llm.py | 16 +++++++++------- vllm/v1/engine/core.py | 5 +++++ vllm/v1/engine/core_client.py | 2 +- vllm/v1/engine/detokenizer.py | 20 +++++++++++++------- 4 files changed, 28 insertions(+), 15 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 8148d28de1bf1..a0ed047f630f1 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -188,9 +188,6 @@ async def add_request( ) -> RequestState: """Add new request to the AsyncLLM.""" - - - # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. _, engine_core_req = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, @@ -275,15 +272,16 @@ async def generate( # a delta text. This way we do "dynamic chunked streaming", such # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. - out = state.out_list[-1] - if len(state.out_list) > 2 and not self.warned: - logger.info(f"{len(state.out_list)=}") - self.warned = True except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. 
logger.debug("Timeout waiting for %s", request_id) continue + + out = state.out_list[-1] + if len(state.out_list) > 2: + logger.info(f"{len(state.out_list)=}") + self.warned = True state.out_list = [] if out.finished: @@ -326,9 +324,13 @@ async def generate( async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" + epoch = 0 try: while True: + logger.info(f"EPOCH: {epoch}") + epoch += 1 + # 1) Pull outputs from the Detokenizer. detokenizer_outputs = ( await self.detokenizer.get_output_async()).outputs diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 2eff8a8adb230..8616141c4cec7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -243,9 +243,14 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the EngineCore.""" + + epoch = 0 # Loop until process is sent a SIGINT or SIGTERM while True: + logger.info(f"EPOCH: {epoch}") + epoch += 1 + # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): while True: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4327e48b86ef1..4c0745060bc02 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -232,7 +232,7 @@ async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: msg = (request_type.value, self.encoder.encode(request)) - await self.input_socket.send_multipart(msg, copy=False, flag=zmq.NOBLOCK) + await self.input_socket.send_multipart(msg, copy=False) async def add_request_async(self, request: EngineCoreRequest) -> None: await self._send_input(EngineCoreRequestType.ADD, request) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index d56f9646fdda3..141be5e81f589 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -231,7 +231,7 @@ def _get_next_output_text(self, finished: bool, delta: bool) -> str: return self.output_text[last_offset:length] return "" - +import time class Detokenizer: def __init__(self, @@ -456,6 +456,8 @@ def run_busy_loop(self): with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as engine_core_outputs_socket, zmq_socket_ctx(self.input_path, zmq.PULL) as input_socket, zmq_socket_ctx(self.output_path, zmq.PUSH) as output_socket): + + epoch = 0 # TODO: avoid poll by having both EngineCore # and AsyncLLM send to the same socket (unclear why this @@ -466,6 +468,9 @@ def run_busy_loop(self): # idx = 0 while True: + logger.info(f"EPOCH: {epoch}") + epoch += 1 + socks = dict(poller.poll()) # Handle NewRequest. @@ -480,14 +485,13 @@ def run_busy_loop(self): engine_core_outputs = decoder_out.decode(frame.buffer).outputs detokenizer_outputs, _ = self.step(engine_core_outputs) msg = encoder.encode(detokenizer_outputs) - output_socket.send_multipart((msg, ), copy=False) + # output_socket.send_multipart((msg, ), copy=False) + output_socket.send(msg) except Exception as e: logger.error(e) raise e -import time - class DetokenizerClient: def __init__(self, *args, engine_core_outputs_path: str, **kwargs): @@ -498,7 +502,7 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): self.decoder = msgspec.msgpack.Decoder(DetokenizerOutputs) # ZMQ setup. - self.ctx = zmq.asyncio.Context(io_threads=2) + self.ctx = zmq.asyncio.Context(4) # Get input (DetokenizerRequest) to Detokenizer. 
input_path = get_open_zmq_ipc_path() @@ -542,5 +546,7 @@ async def add_request_async(self, request: DetokenizerRequest): async def get_output_async(self) -> DetokenizerOutputs: """Get RequestOutputs, RequestsToAbort from Detokenizer.""" - (frame, ) = await self.output_socket.recv_multipart(copy=False) - return self.decoder.decode(frame.buffer) + # (frame, ) = await self.output_socket.recv_multipart(copy=False) + # return self.decoder.decode(frame.buffer) + msg = await self.output_socket.recv() + return self.decoder.decode(msg) From e3d6b0e3cfc6ed3572c8a99ca8e5ed0dbdffe253 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 19 Dec 2024 04:22:58 +0000 Subject: [PATCH 034/132] stash --- vllm/v1/engine/__init__.py | 2 +- vllm/v1/engine/async_llm.py | 4 ++-- vllm/v1/engine/core.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 860662602160a..f81869a46b837 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -89,7 +89,7 @@ class EngineCoreOutput( request_id: str prompt_token_ids: List[int] - prompt: str + prompt: Optional[str] new_token_ids: List[int] finished: bool finish_reason: Optional[str] = None diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a0ed047f630f1..e95ee059853b6 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -328,8 +328,8 @@ async def _run_output_handler(self): try: while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 + # logger.info(f"EPOCH: {epoch}") + # epoch += 1 # 1) Pull outputs from the Detokenizer. detokenizer_outputs = ( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 8616141c4cec7..2957039c8fa19 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -248,8 +248,8 @@ def run_busy_loop(self): # Loop until process is sent a SIGINT or SIGTERM while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 + # logger.info(f"EPOCH: {epoch}") + # epoch += 1 # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): From 2c0a7939484892b6ab76ddf58c71291db75ab9d4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 20 Dec 2024 03:28:21 +0000 Subject: [PATCH 035/132] yay --- benchmarks/benchmark_throughput.py | 3 + vllm/v1/engine/async_llm.py | 93 ++++++++++++++++-------------- vllm/v1/engine/core_client.py | 1 + vllm/v1/engine/detokenizer.py | 17 +++--- 4 files changed, 63 insertions(+), 51 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 0926cec29a907..12ee9798d827e 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -207,6 +207,9 @@ async def run_vllm_async( # async for i, res in all_gens: # pass + from aiodebug import log_slow_callbacks + loop = asyncio.get_event_loop() + log_slow_callbacks.enable(0.05) await asyncio.gather(*tasks) end = time.perf_counter() diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index e95ee059853b6..88de4d114446a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -115,6 +115,7 @@ def __init__( # self.output_handler: Optional[asyncio.Task] = None self.to_create_loop = True + self.epoch = 0 def __del__(self): self.shutdown() @@ -246,9 +247,17 @@ async def generate( # to handle startup failure gracefully in the OpenAI server. 
# if self.output_handler is None: if self.to_create_loop: + + import signal + def signal_handler(self, signum=None, frame=None): + logger.warning( + f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." + ) + + self.to_create_loop = False loop = asyncio.get_event_loop() loop.create_task(self._run_output_handler()) - self.to_create_loop = False + loop.add_signal_handler(signal.SIGTERM, signal_handler) state = await self.add_request( request_id, @@ -275,13 +284,13 @@ async def generate( except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. - logger.debug("Timeout waiting for %s", request_id) + # logger.debug("Timeout waiting for %s", request_id) continue out = state.out_list[-1] if len(state.out_list) > 2: logger.info(f"{len(state.out_list)=}") - self.warned = True + state.out_list = [] if out.finished: @@ -324,46 +333,44 @@ async def generate( async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" - epoch = 0 - - try: - while True: - # logger.info(f"EPOCH: {epoch}") - # epoch += 1 - - # 1) Pull outputs from the Detokenizer. - detokenizer_outputs = ( - await self.detokenizer.get_output_async()).outputs - - for detok_out in detokenizer_outputs: - if detok_out.request_id not in self.rid_to_state: - raise RuntimeError(f"{detok_out.request_id} " - "not in RequestStates") - - state = self.rid_to_state[detok_out.request_id] - - out = RequestOutput.from_detok( - state.prompt, - state.prompt_token_ids, - detok_out, - ) - - # Update the RequestState and alert generate() that there - # is a RequestOutput ready to return to the user. - - state.out_list.append(out) - state.event.set() - - # 3) Abort any requests that finished due to stop strings. - # await self.engine_core.abort_requests_async(reqs_to_abort) - - # 4) Abort any requests due to client cancellations. - # TODO: send back to detokenizer if this fails. - # await self._process_cancellations() - - except Exception as e: - logger.error(e) - raise e + # epoch = 0 + + while True: + # logger.info(f"EPOCH: {epoch}") + # self.warned = False + # if self.epoch % 10 == 0: + # logger.info(f"\n{self.epoch=}\n") + # self.epoch += 1 + + # 1) Pull outputs from the Detokenizer. + detokenizer_outputs = ( + await self.detokenizer.get_output_async()).outputs + + for detok_out in detokenizer_outputs: + if detok_out.request_id not in self.rid_to_state: + raise RuntimeError(f"{detok_out.request_id} " + "not in RequestStates") + + state = self.rid_to_state[detok_out.request_id] + + out = RequestOutput.from_detok( + state.prompt, + state.prompt_token_ids, + detok_out, + ) + + # Update the RequestState and alert generate() that there + # is a RequestOutput ready to return to the user. + + state.out_list.append(out) + state.event.set() + + # 3) Abort any requests that finished due to stop strings. + # await self.engine_core.abort_requests_async(reqs_to_abort) + + # 4) Abort any requests due to client cancellations. + # TODO: send back to detokenizer if this fails. + # await self._process_cancellations() async def abort(self, request_id: str) -> None: # Note: this is not used outside of testing. diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4c0745060bc02..7559ca1af2a03 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -140,6 +140,7 @@ def __init__( # ZMQ setup. 
if asyncio_mode: + print("HERE HERE HERE") self.ctx = zmq.asyncio.Context(io_threads=2) else: self.ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 141be5e81f589..a355be676d5e5 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -468,8 +468,8 @@ def run_busy_loop(self): # idx = 0 while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 + # logger.info(f"EPOCH: {epoch}") + # epoch += 1 socks = dict(poller.poll()) @@ -484,9 +484,10 @@ def run_busy_loop(self): (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs detokenizer_outputs, _ = self.step(engine_core_outputs) - msg = encoder.encode(detokenizer_outputs) - # output_socket.send_multipart((msg, ), copy=False) - output_socket.send(msg) + # msg = encoder.encode(detokenizer_outputs) + # # output_socket.send_multipart((msg, ), copy=False) + # output_socket.send(msg) + output_socket.send_pyobj(detokenizer_outputs) except Exception as e: logger.error(e) @@ -502,7 +503,7 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): self.decoder = msgspec.msgpack.Decoder(DetokenizerOutputs) # ZMQ setup. - self.ctx = zmq.asyncio.Context(4) + self.ctx = zmq.asyncio.Context(2) # Get input (DetokenizerRequest) to Detokenizer. input_path = get_open_zmq_ipc_path() @@ -548,5 +549,5 @@ async def get_output_async(self) -> DetokenizerOutputs: # (frame, ) = await self.output_socket.recv_multipart(copy=False) # return self.decoder.decode(frame.buffer) - msg = await self.output_socket.recv() - return self.decoder.decode(msg) + return await self.output_socket.recv_pyobj() + # return self.decoder.decode(msg) From ee791b21dddeed6deb39b68e3a65f2c6bcf54217 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 20 Dec 2024 03:44:26 +0000 Subject: [PATCH 036/132] no more preemptions --- vllm/v1/core/scheduler.py | 1 + vllm/v1/engine/async_llm.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 23dc4ef298fea..c83c931f75fea 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -140,6 +140,7 @@ def schedule(self) -> "SchedulerOutput": preempted_req.status = RequestStatus.PREEMPTED preempted_req.num_computed_tokens = 0 + logger.info(f"Preempted: {preempted_req.request_id}") self.waiting.appendleft(preempted_req) preempted_reqs.append(preempted_req) if preempted_req == request: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 88de4d114446a..ba48c7d6f1761 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -288,9 +288,8 @@ def signal_handler(self, signum=None, frame=None): continue out = state.out_list[-1] - if len(state.out_list) > 2: + if len(state.out_list) > 1: logger.info(f"{len(state.out_list)=}") - state.out_list = [] if out.finished: From 37135028988d085dc3af964f6c097707f51670fe Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 17:13:33 +0000 Subject: [PATCH 037/132] stash current state of async llm --- vllm/entrypoints/openai/api_server.py | 6 +++- vllm/entrypoints/openai/serving_completion.py | 14 ++++++++ vllm/v1/engine/async_llm.py | 30 +++++++---------- vllm/v1/engine/core.py | 7 ++-- vllm/v1/engine/detokenizer.py | 32 +++++++------------ 5 files changed, 44 insertions(+), 45 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py 
b/vllm/entrypoints/openai/api_server.py index 14e3a34ce141c..f301ada394000 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -374,12 +374,15 @@ async def create_chat_completion(request: ChatCompletionRequest, @router.post("/v1/completions") async def create_completion(request: CompletionRequest, raw_request: Request): + raw_request.app.count += 1 + should_profile = raw_request.app.count == 500 handler = completion(raw_request) if handler is None: return base(raw_request).create_error_response( message="The model does not support Completions API") - generator = await handler.create_completion(request, raw_request) + generator = await handler.create_completion(request, raw_request, + should_profile=should_profile) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -494,6 +497,7 @@ def build_app(args: Namespace) -> FastAPI: app = FastAPI(lifespan=lifespan) app.include_router(router) app.root_path = args.root_path + app.count = 0 mount_metrics(app) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 3b0bc2fa897c6..46c820fb5a794 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -61,6 +61,7 @@ async def create_completion( self, request: CompletionRequest, raw_request: Request, + should_profile: bool=False ) -> Union[AsyncGenerator[str, None], CompletionResponse, ErrorResponse]: """Completion API similar to OpenAI's API. @@ -188,6 +189,7 @@ async def create_completion( tokenizer=tokenizer, request_metadata=request_metadata, output_kind=sampling_params.output_kind, + should_profile=should_profile, ) # Non-streaming response @@ -247,6 +249,7 @@ async def completion_stream_generator( tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, output_kind: RequestOutputKind, + should_profile: bool = False, ) -> AsyncGenerator[str, None]: """ In V0, we use RequestOutputType.DELTA and each RequestOutput @@ -268,6 +271,12 @@ async def completion_stream_generator( such that the API server falls behind, we dynamically fall back to streaming chunks of tokens. """ + if should_profile: + from pyinstrument import Profiler + print("STARTING PROFILER") + profiler = Profiler(async_mode="disabled") + profiler.start() + assert (output_kind == RequestOutputKind.CUMULATIVE or output_kind == RequestOutputKind.DELTA) @@ -293,6 +302,10 @@ async def completion_stream_generator( prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt + if res.finished and should_profile: + profiler.stop() + profiler.write_html("task-disabled.html") + # Prompt details are excluded from later streamed outputs if res.prompt_token_ids is not None: num_prompt_tokens[prompt_idx] = len(res.prompt_token_ids) @@ -419,6 +432,7 @@ async def completion_stream_generator( yield f"data: {data}\n\n" yield "data: [DONE]\n\n" + def request_output_to_completion_response( self, final_res_batch: List[RequestOutput], diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index ba48c7d6f1761..09ee89e645a44 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -281,16 +281,15 @@ def signal_handler(self, signum=None, frame=None): # a delta text. This way we do "dynamic chunked streaming", such # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. 
+ out = state.out_list[-1] + if len(state.out_list) > 1: + logger.info(f"{len(state.out_list)=}") except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. # logger.debug("Timeout waiting for %s", request_id) continue - out = state.out_list[-1] - if len(state.out_list) > 1: - logger.info(f"{len(state.out_list)=}") - state.out_list = [] if out.finished: del self.rid_to_state[request_id] @@ -332,31 +331,24 @@ def signal_handler(self, signum=None, frame=None): async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" - # epoch = 0 + epoch = 0 while True: - # logger.info(f"EPOCH: {epoch}") + logger.info(f"EPOCH: {epoch}") + epoch += 1 # self.warned = False # if self.epoch % 10 == 0: # logger.info(f"\n{self.epoch=}\n") - # self.epoch += 1 # 1) Pull outputs from the Detokenizer. - detokenizer_outputs = ( - await self.detokenizer.get_output_async()).outputs + outputs = await self.detokenizer.output_socket.recv_pyobj() - for detok_out in detokenizer_outputs: - if detok_out.request_id not in self.rid_to_state: - raise RuntimeError(f"{detok_out.request_id} " + for out in outputs: + if out.request_id not in self.rid_to_state: + raise RuntimeError(f"{out.request_id} " "not in RequestStates") - state = self.rid_to_state[detok_out.request_id] - - out = RequestOutput.from_detok( - state.prompt, - state.prompt_token_ids, - detok_out, - ) + state = self.rid_to_state[out.request_id] # Update the RequestState and alert generate() that there # is a RequestOutput ready to return to the user. diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 2957039c8fa19..f8aef10908514 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -244,12 +244,11 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the EngineCore.""" - epoch = 0 - # Loop until process is sent a SIGINT or SIGTERM + epoch = 0 while True: - # logger.info(f"EPOCH: {epoch}") - # epoch += 1 + logger.info(f"EPOCH: {epoch}") + epoch += 1 # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index a355be676d5e5..8db857bca9b72 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -293,9 +293,9 @@ def step( ) -> DetokenizerOutputs: """Update state and request the RequestOutputs to the LLMEngine.""" - # request_outputs: List[RequestOutput] = [] + request_outputs: List[RequestOutput] = [] # requests_to_abort: List[str] = [] - detokenizer_outputs = DetokenizerOutputs(outputs=[]) + # detokenizer_outputs = DetokenizerOutputs(outputs=[]) for engine_core_output in encore_core_outputs: request_id = engine_core_output.request_id @@ -316,16 +316,8 @@ def step( ) if request_output is not None: - detokenizer_outputs.outputs.append( - DetokenizerOutput( - request_id=request_id, - token_ids=request_output.outputs[0].token_ids, - text=request_output.outputs[0].text, - finished=request_output.finished, - ) - ) - # # Add to RequestOutputs list. - # request_outputs.append(request_output) + # Add to RequestOutputs list. + request_outputs.append(request_output) # # Free completed requests. # if request_output.finished: @@ -338,7 +330,7 @@ def step( # Return to EngineClient. 
# return request_outputs, requests_to_abort - return detokenizer_outputs, [] + return request_outputs, [] class DetokenizerProc(Detokenizer): """ZMQ-wrapper for running Detokenizer in background process.""" @@ -456,8 +448,6 @@ def run_busy_loop(self): with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as engine_core_outputs_socket, zmq_socket_ctx(self.input_path, zmq.PULL) as input_socket, zmq_socket_ctx(self.output_path, zmq.PUSH) as output_socket): - - epoch = 0 # TODO: avoid poll by having both EngineCore # and AsyncLLM send to the same socket (unclear why this @@ -466,10 +456,10 @@ def run_busy_loop(self): poller.register(engine_core_outputs_socket, zmq.POLLIN) poller.register(input_socket, zmq.POLLIN) - # idx = 0 + epoch = 0 while True: - # logger.info(f"EPOCH: {epoch}") - # epoch += 1 + logger.info(f"EPOCH: {epoch}") + epoch += 1 socks = dict(poller.poll()) @@ -483,11 +473,11 @@ def run_busy_loop(self): if engine_core_outputs_socket in socks: (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs - detokenizer_outputs, _ = self.step(engine_core_outputs) + request_outputs, _ = self.step(engine_core_outputs) # msg = encoder.encode(detokenizer_outputs) # # output_socket.send_multipart((msg, ), copy=False) # output_socket.send(msg) - output_socket.send_pyobj(detokenizer_outputs) + output_socket.send_pyobj(request_outputs) except Exception as e: logger.error(e) @@ -544,7 +534,7 @@ async def add_request_async(self, request: DetokenizerRequest): msg = (self.encoder.encode(request), ) await self.input_socket.send_multipart(msg, copy=False) - async def get_output_async(self) -> DetokenizerOutputs: + async def get_output_async(self) -> List[RequestOutput]: """Get RequestOutputs, RequestsToAbort from Detokenizer.""" # (frame, ) = await self.output_socket.recv_multipart(copy=False) From bcd45be052c88e965486f8eeed27537bbc098234 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 18:30:34 +0000 Subject: [PATCH 038/132] stash profile' --- vllm/entrypoints/openai/api_server.py | 445 +++++++++--------- vllm/entrypoints/openai/serving_completion.py | 7 +- 2 files changed, 228 insertions(+), 224 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f301ada394000..fe312c3f96cdb 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -245,7 +245,7 @@ def _cleanup_ipc_path(): multiprocess.mark_process_dead(engine_process.pid) -router = APIRouter() +# router = APIRouter() def mount_metrics(app: FastAPI): @@ -303,253 +303,254 @@ def engine_client(request: Request) -> EngineClient: return request.app.state.engine_client -@router.get("/health") -async def health(raw_request: Request) -> Response: - """Health check.""" - await engine_client(raw_request).check_health() - return Response(status_code=200) - - -@router.post("/tokenize") -async def tokenize(request: TokenizeRequest, raw_request: Request): - handler = tokenization(raw_request) - - generator = await handler.create_tokenize(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, TokenizeResponse): - return JSONResponse(content=generator.model_dump()) - - assert_never(generator) - - -@router.post("/detokenize") -async def detokenize(request: DetokenizeRequest, raw_request: Request): - handler = tokenization(raw_request) - - 
generator = await handler.create_detokenize(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, DetokenizeResponse): - return JSONResponse(content=generator.model_dump()) - - assert_never(generator) - - -@router.get("/v1/models") -async def show_available_models(raw_request: Request): - handler = base(raw_request) - - models = await handler.show_available_models() - return JSONResponse(content=models.model_dump()) - - -@router.get("/version") -async def show_version(): - ver = {"version": VLLM_VERSION} - return JSONResponse(content=ver) - - -@router.post("/v1/chat/completions") -async def create_chat_completion(request: ChatCompletionRequest, - raw_request: Request): - handler = chat(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Chat Completions API") - - generator = await handler.create_chat_completion(request, raw_request) - - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - - elif isinstance(generator, ChatCompletionResponse): - return JSONResponse(content=generator.model_dump()) - - return StreamingResponse(content=generator, media_type="text/event-stream") - - -@router.post("/v1/completions") -async def create_completion(request: CompletionRequest, raw_request: Request): - raw_request.app.count += 1 - should_profile = raw_request.app.count == 500 - handler = completion(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Completions API") - - generator = await handler.create_completion(request, raw_request, - should_profile=should_profile) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, CompletionResponse): - return JSONResponse(content=generator.model_dump()) - - return StreamingResponse(content=generator, media_type="text/event-stream") +def build_app(args: Namespace) -> FastAPI: + if args.disable_fastapi_docs: + app = FastAPI(openapi_url=None, + docs_url=None, + redoc_url=None, + lifespan=lifespan) + else: + app = FastAPI(lifespan=lifespan) + # app.include_router(router) + app.root_path = args.root_path + app.count = 0 + mount_metrics(app) -@router.post("/v1/embeddings") -async def create_embedding(request: EmbeddingRequest, raw_request: Request): - handler = embedding(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Embeddings API") + app.add_middleware( + CORSMiddleware, + allow_origins=args.allowed_origins, + allow_credentials=args.allow_credentials, + allow_methods=args.allowed_methods, + allow_headers=args.allowed_headers, + ) - generator = await handler.create_embedding(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, EmbeddingResponse): - return JSONResponse(content=generator.model_dump()) + @app.get("/health") + async def health(raw_request: Request) -> Response: + """Health check.""" + await engine_client(raw_request).check_health() + return Response(status_code=200) - assert_never(generator) + @app.post("/tokenize") + async def tokenize(request: TokenizeRequest, raw_request: Request): + handler = tokenization(raw_request) 
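
# For context on the hunk above: endpoints move from a module-level APIRouter
# onto closures defined inside the app factory, so each handler can close over
# per-app state (here the request counter that drives the profiling hook).
# A minimal sketch of the two registration styles; `make_app` and the `/count`
# route are illustrative names, not part of vLLM.
from fastapi import APIRouter, FastAPI

# Style 1: module-level router, attached to the app later.
router = APIRouter()


@router.get("/ping")
async def ping():
    return {"pong": True}


# Style 2: routes declared inside the factory, capturing app-local state.
def make_app() -> FastAPI:
    app = FastAPI()
    app.include_router(router)
    app.state.counter = 0

    @app.get("/count")
    async def count():
        app.state.counter += 1
        return {"count": app.state.counter}

    return app


app = make_app()
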
-@router.post("/score") -async def create_score(request: ScoreRequest, raw_request: Request): - handler = score(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Score API") + generator = await handler.create_tokenize(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, TokenizeResponse): + return JSONResponse(content=generator.model_dump()) - generator = await handler.create_score(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, ScoreResponse): - return JSONResponse(content=generator.model_dump()) + assert_never(generator) - assert_never(generator) + @app.post("/detokenize") + async def detokenize(request: DetokenizeRequest, raw_request: Request): + handler = tokenization(raw_request) -@router.post("/v1/score") -async def create_score_v1(request: ScoreRequest, raw_request: Request): - logger.warning( - "To indicate that Score API is not part of standard OpenAI API, we " - "have moved it to `/score`. Please update your client accordingly.") + generator = await handler.create_detokenize(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, DetokenizeResponse): + return JSONResponse(content=generator.model_dump()) - return await create_score(request, raw_request) + assert_never(generator) -if envs.VLLM_TORCH_PROFILER_DIR: - logger.warning( - "Torch Profiler is enabled in the API server. This should ONLY be " - "used for local development!") + @app.get("/v1/models") + async def show_available_models(raw_request: Request): + handler = base(raw_request) - @router.post("/start_profile") - async def start_profile(raw_request: Request): - logger.info("Starting profiler...") - await engine_client(raw_request).start_profile() - logger.info("Profiler started.") - return Response(status_code=200) + models = await handler.show_available_models() + return JSONResponse(content=models.model_dump()) - @router.post("/stop_profile") - async def stop_profile(raw_request: Request): - logger.info("Stopping profiler...") - await engine_client(raw_request).stop_profile() - logger.info("Profiler stopped.") - return Response(status_code=200) + @app.get("/version") + async def show_version(): + ver = {"version": VLLM_VERSION} + return JSONResponse(content=ver) -if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: - logger.warning( - "Lora dynamic loading & unloading is enabled in the API server. 
" - "This should ONLY be used for local development!") - @router.post("/v1/load_lora_adapter") - async def load_lora_adapter(request: LoadLoraAdapterRequest, - raw_request: Request): - for route in [chat, completion, embedding]: - handler = route(raw_request) - if handler is not None: - response = await handler.load_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + @app.post("/v1/chat/completions") + async def create_chat_completion(request: ChatCompletionRequest, + raw_request: Request): + handler = chat(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Chat Completions API") - return Response(status_code=200, content=response) + generator = await handler.create_chat_completion(request, raw_request) - @router.post("/v1/unload_lora_adapter") - async def unload_lora_adapter(request: UnloadLoraAdapterRequest, - raw_request: Request): - for route in [chat, completion, embedding]: - handler = route(raw_request) - if handler is not None: - response = await handler.unload_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) - return Response(status_code=200, content=response) + elif isinstance(generator, ChatCompletionResponse): + return JSONResponse(content=generator.model_dump()) + return StreamingResponse(content=generator, media_type="text/event-stream") -def build_app(args: Namespace) -> FastAPI: - if args.disable_fastapi_docs: - app = FastAPI(openapi_url=None, - docs_url=None, - redoc_url=None, - lifespan=lifespan) - else: - app = FastAPI(lifespan=lifespan) - app.include_router(router) - app.root_path = args.root_path - app.count = 0 - mount_metrics(app) + @app.post("/v1/completions") + async def create_completion(request: CompletionRequest, raw_request: Request): + raw_request.app.count += 1 + should_profile = raw_request.app.count == 500 + print(f"{should_profile=}") + handler = completion(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Completions API") - app.add_middleware( - CORSMiddleware, - allow_origins=args.allowed_origins, - allow_credentials=args.allow_credentials, - allow_methods=args.allowed_methods, - allow_headers=args.allowed_headers, - ) + generator = await handler.create_completion(request, raw_request, + should_profile=should_profile) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, CompletionResponse): + return JSONResponse(content=generator.model_dump()) - @app.exception_handler(RequestValidationError) - async def validation_exception_handler(_, exc): - err = ErrorResponse(message=str(exc), - type="BadRequestError", - code=HTTPStatus.BAD_REQUEST) - return JSONResponse(err.model_dump(), - status_code=HTTPStatus.BAD_REQUEST) - - if token := envs.VLLM_API_KEY or args.api_key: - - @app.middleware("http") - async def authentication(request: Request, call_next): - if request.method == "OPTIONS": - return await call_next(request) - url_path = request.url.path - if app.root_path and url_path.startswith(app.root_path): - url_path = url_path[len(app.root_path):] - if not url_path.startswith("/v1"): - return await 
call_next(request) - if request.headers.get("Authorization") != "Bearer " + token: - return JSONResponse(content={"error": "Unauthorized"}, - status_code=401) - return await call_next(request) - - @app.middleware("http") - async def add_request_id(request: Request, call_next): - request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex - response = await call_next(request) - response.headers["X-Request-Id"] = request_id - return response - - for middleware in args.middleware: - module_path, object_name = middleware.rsplit(".", 1) - imported = getattr(importlib.import_module(module_path), object_name) - if inspect.isclass(imported): - app.add_middleware(imported) - elif inspect.iscoroutinefunction(imported): - app.middleware("http")(imported) - else: - raise ValueError(f"Invalid middleware {middleware}. " - f"Must be a function or a class.") + return StreamingResponse(content=generator, media_type="text/event-stream") + + + @app.post("/v1/embeddings") + async def create_embedding(request: EmbeddingRequest, raw_request: Request): + handler = embedding(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Embeddings API") + + generator = await handler.create_embedding(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, EmbeddingResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +# @router.post("/score") +# async def create_score(request: ScoreRequest, raw_request: Request): +# handler = score(raw_request) +# if handler is None: +# return base(raw_request).create_error_response( +# message="The model does not support Score API") + +# generator = await handler.create_score(request, raw_request) +# if isinstance(generator, ErrorResponse): +# return JSONResponse(content=generator.model_dump(), +# status_code=generator.code) +# elif isinstance(generator, ScoreResponse): +# return JSONResponse(content=generator.model_dump()) + +# assert_never(generator) + + +# @router.post("/v1/score") +# async def create_score_v1(request: ScoreRequest, raw_request: Request): +# logger.warning( +# "To indicate that Score API is not part of standard OpenAI API, we " +# "have moved it to `/score`. Please update your client accordingly.") + +# return await create_score(request, raw_request) + + +# if envs.VLLM_TORCH_PROFILER_DIR: +# logger.warning( +# "Torch Profiler is enabled in the API server. This should ONLY be " +# "used for local development!") + +# @router.post("/start_profile") +# async def start_profile(raw_request: Request): +# logger.info("Starting profiler...") +# await engine_client(raw_request).start_profile() +# logger.info("Profiler started.") +# return Response(status_code=200) + +# @router.post("/stop_profile") +# async def stop_profile(raw_request: Request): +# logger.info("Stopping profiler...") +# await engine_client(raw_request).stop_profile() +# logger.info("Profiler stopped.") +# return Response(status_code=200) + + +# if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: +# logger.warning( +# "Lora dynamic loading & unloading is enabled in the API server. 
" +# "This should ONLY be used for local development!") + +# @router.post("/v1/load_lora_adapter") +# async def load_lora_adapter(request: LoadLoraAdapterRequest, +# raw_request: Request): +# for route in [chat, completion, embedding]: +# handler = route(raw_request) +# if handler is not None: +# response = await handler.load_lora_adapter(request) +# if isinstance(response, ErrorResponse): +# return JSONResponse(content=response.model_dump(), +# status_code=response.code) + +# return Response(status_code=200, content=response) + +# @router.post("/v1/unload_lora_adapter") +# async def unload_lora_adapter(request: UnloadLoraAdapterRequest, +# raw_request: Request): +# for route in [chat, completion, embedding]: +# handler = route(raw_request) +# if handler is not None: +# response = await handler.unload_lora_adapter(request) +# if isinstance(response, ErrorResponse): +# return JSONResponse(content=response.model_dump(), +# status_code=response.code) + +# return Response(status_code=200, content=response) + + # @app.exception_handler(RequestValidationError) + # async def validation_exception_handler(_, exc): + # err = ErrorResponse(message=str(exc), + # type="BadRequestError", + # code=HTTPStatus.BAD_REQUEST) + # return JSONResponse(err.model_dump(), + # status_code=HTTPStatus.BAD_REQUEST) + + # if token := envs.VLLM_API_KEY or args.api_key: + + # @app.middleware("http") + # async def authentication(request: Request, call_next): + # if request.method == "OPTIONS": + # return await call_next(request) + # url_path = request.url.path + # if app.root_path and url_path.startswith(app.root_path): + # url_path = url_path[len(app.root_path):] + # if not url_path.startswith("/v1"): + # return await call_next(request) + # if request.headers.get("Authorization") != "Bearer " + token: + # return JSONResponse(content={"error": "Unauthorized"}, + # status_code=401) + # return await call_next(request) + + # @app.middleware("http") + # async def add_request_id(request: Request, call_next): + # request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex + # response = await call_next(request) + # response.headers["X-Request-Id"] = request_id + # return response + + # print(f"{args.middleware=}") + # for middleware in args.middleware: + # module_path, object_name = middleware.rsplit(".", 1) + # imported = getattr(importlib.import_module(module_path), object_name) + # if inspect.isclass(imported): + # app.add_middleware(imported) + # elif inspect.iscoroutinefunction(imported): + # app.middleware("http")(imported) + # else: + # raise ValueError(f"Invalid middleware {middleware}. " + # f"Must be a function or a class.") return app diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 46c820fb5a794..a4f9f6b9af536 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -6,6 +6,7 @@ from fastapi import Request +from pyinstrument import Profiler from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger @@ -72,6 +73,7 @@ async def create_completion( - suffix (the language models we currently support do not support suffix) """ + error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret @@ -271,8 +273,9 @@ async def completion_stream_generator( such that the API server falls behind, we dynamically fall back to streaming chunks of tokens. 
""" + print(f"GENERATOR: {should_profile=}") + should_profile=False if should_profile: - from pyinstrument import Profiler print("STARTING PROFILER") profiler = Profiler(async_mode="disabled") profiler.start() @@ -304,7 +307,7 @@ async def completion_stream_generator( if res.finished and should_profile: profiler.stop() - profiler.write_html("task-disabled.html") + profiler.write_html("vllm-proxy.html") # Prompt details are excluded from later streamed outputs if res.prompt_token_ids is not None: From 2c067956128663e087d2cd68cb500f50007ada28 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 18:32:34 +0000 Subject: [PATCH 039/132] Revert "stash profile'" This reverts commit bcd45be052c88e965486f8eeed27537bbc098234. --- vllm/entrypoints/openai/api_server.py | 445 +++++++++--------- vllm/entrypoints/openai/serving_completion.py | 7 +- 2 files changed, 224 insertions(+), 228 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index fe312c3f96cdb..f301ada394000 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -245,7 +245,7 @@ def _cleanup_ipc_path(): multiprocess.mark_process_dead(engine_process.pid) -# router = APIRouter() +router = APIRouter() def mount_metrics(app: FastAPI): @@ -303,254 +303,253 @@ def engine_client(request: Request) -> EngineClient: return request.app.state.engine_client -def build_app(args: Namespace) -> FastAPI: - if args.disable_fastapi_docs: - app = FastAPI(openapi_url=None, - docs_url=None, - redoc_url=None, - lifespan=lifespan) - else: - app = FastAPI(lifespan=lifespan) - # app.include_router(router) - app.root_path = args.root_path - app.count = 0 +@router.get("/health") +async def health(raw_request: Request) -> Response: + """Health check.""" + await engine_client(raw_request).check_health() + return Response(status_code=200) - mount_metrics(app) - app.add_middleware( - CORSMiddleware, - allow_origins=args.allowed_origins, - allow_credentials=args.allow_credentials, - allow_methods=args.allowed_methods, - allow_headers=args.allowed_headers, - ) +@router.post("/tokenize") +async def tokenize(request: TokenizeRequest, raw_request: Request): + handler = tokenization(raw_request) - @app.get("/health") - async def health(raw_request: Request) -> Response: - """Health check.""" - await engine_client(raw_request).check_health() - return Response(status_code=200) + generator = await handler.create_tokenize(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, TokenizeResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +@router.post("/detokenize") +async def detokenize(request: DetokenizeRequest, raw_request: Request): + handler = tokenization(raw_request) + + generator = await handler.create_detokenize(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, DetokenizeResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) - @app.post("/tokenize") - async def tokenize(request: TokenizeRequest, raw_request: Request): - handler = tokenization(raw_request) +@router.get("/v1/models") +async def show_available_models(raw_request: Request): + handler = base(raw_request) - generator = await handler.create_tokenize(request, 
raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, TokenizeResponse): - return JSONResponse(content=generator.model_dump()) + models = await handler.show_available_models() + return JSONResponse(content=models.model_dump()) - assert_never(generator) +@router.get("/version") +async def show_version(): + ver = {"version": VLLM_VERSION} + return JSONResponse(content=ver) - @app.post("/detokenize") - async def detokenize(request: DetokenizeRequest, raw_request: Request): - handler = tokenization(raw_request) - generator = await handler.create_detokenize(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, DetokenizeResponse): - return JSONResponse(content=generator.model_dump()) +@router.post("/v1/chat/completions") +async def create_chat_completion(request: ChatCompletionRequest, + raw_request: Request): + handler = chat(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Chat Completions API") - assert_never(generator) + generator = await handler.create_chat_completion(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) - @app.get("/v1/models") - async def show_available_models(raw_request: Request): - handler = base(raw_request) + elif isinstance(generator, ChatCompletionResponse): + return JSONResponse(content=generator.model_dump()) - models = await handler.show_available_models() - return JSONResponse(content=models.model_dump()) + return StreamingResponse(content=generator, media_type="text/event-stream") - @app.get("/version") - async def show_version(): - ver = {"version": VLLM_VERSION} - return JSONResponse(content=ver) +@router.post("/v1/completions") +async def create_completion(request: CompletionRequest, raw_request: Request): + raw_request.app.count += 1 + should_profile = raw_request.app.count == 500 + handler = completion(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Completions API") + generator = await handler.create_completion(request, raw_request, + should_profile=should_profile) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, CompletionResponse): + return JSONResponse(content=generator.model_dump()) - @app.post("/v1/chat/completions") - async def create_chat_completion(request: ChatCompletionRequest, - raw_request: Request): - handler = chat(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Chat Completions API") + return StreamingResponse(content=generator, media_type="text/event-stream") - generator = await handler.create_chat_completion(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) +@router.post("/v1/embeddings") +async def create_embedding(request: EmbeddingRequest, raw_request: Request): + handler = embedding(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Embeddings API") - elif isinstance(generator, ChatCompletionResponse): - return 
JSONResponse(content=generator.model_dump()) + generator = await handler.create_embedding(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, EmbeddingResponse): + return JSONResponse(content=generator.model_dump()) - return StreamingResponse(content=generator, media_type="text/event-stream") + assert_never(generator) - @app.post("/v1/completions") - async def create_completion(request: CompletionRequest, raw_request: Request): - raw_request.app.count += 1 - should_profile = raw_request.app.count == 500 - print(f"{should_profile=}") - handler = completion(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Completions API") +@router.post("/score") +async def create_score(request: ScoreRequest, raw_request: Request): + handler = score(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Score API") - generator = await handler.create_completion(request, raw_request, - should_profile=should_profile) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, CompletionResponse): - return JSONResponse(content=generator.model_dump()) + generator = await handler.create_score(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, ScoreResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +@router.post("/v1/score") +async def create_score_v1(request: ScoreRequest, raw_request: Request): + logger.warning( + "To indicate that Score API is not part of standard OpenAI API, we " + "have moved it to `/score`. Please update your client accordingly.") + + return await create_score(request, raw_request) + + +if envs.VLLM_TORCH_PROFILER_DIR: + logger.warning( + "Torch Profiler is enabled in the API server. 
This should ONLY be " + "used for local development!") + + @router.post("/start_profile") + async def start_profile(raw_request: Request): + logger.info("Starting profiler...") + await engine_client(raw_request).start_profile() + logger.info("Profiler started.") + return Response(status_code=200) - return StreamingResponse(content=generator, media_type="text/event-stream") - - - @app.post("/v1/embeddings") - async def create_embedding(request: EmbeddingRequest, raw_request: Request): - handler = embedding(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Embeddings API") - - generator = await handler.create_embedding(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, EmbeddingResponse): - return JSONResponse(content=generator.model_dump()) - - assert_never(generator) - - -# @router.post("/score") -# async def create_score(request: ScoreRequest, raw_request: Request): -# handler = score(raw_request) -# if handler is None: -# return base(raw_request).create_error_response( -# message="The model does not support Score API") - -# generator = await handler.create_score(request, raw_request) -# if isinstance(generator, ErrorResponse): -# return JSONResponse(content=generator.model_dump(), -# status_code=generator.code) -# elif isinstance(generator, ScoreResponse): -# return JSONResponse(content=generator.model_dump()) - -# assert_never(generator) - - -# @router.post("/v1/score") -# async def create_score_v1(request: ScoreRequest, raw_request: Request): -# logger.warning( -# "To indicate that Score API is not part of standard OpenAI API, we " -# "have moved it to `/score`. Please update your client accordingly.") - -# return await create_score(request, raw_request) - - -# if envs.VLLM_TORCH_PROFILER_DIR: -# logger.warning( -# "Torch Profiler is enabled in the API server. This should ONLY be " -# "used for local development!") - -# @router.post("/start_profile") -# async def start_profile(raw_request: Request): -# logger.info("Starting profiler...") -# await engine_client(raw_request).start_profile() -# logger.info("Profiler started.") -# return Response(status_code=200) - -# @router.post("/stop_profile") -# async def stop_profile(raw_request: Request): -# logger.info("Stopping profiler...") -# await engine_client(raw_request).stop_profile() -# logger.info("Profiler stopped.") -# return Response(status_code=200) - - -# if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: -# logger.warning( -# "Lora dynamic loading & unloading is enabled in the API server. 
" -# "This should ONLY be used for local development!") - -# @router.post("/v1/load_lora_adapter") -# async def load_lora_adapter(request: LoadLoraAdapterRequest, -# raw_request: Request): -# for route in [chat, completion, embedding]: -# handler = route(raw_request) -# if handler is not None: -# response = await handler.load_lora_adapter(request) -# if isinstance(response, ErrorResponse): -# return JSONResponse(content=response.model_dump(), -# status_code=response.code) - -# return Response(status_code=200, content=response) - -# @router.post("/v1/unload_lora_adapter") -# async def unload_lora_adapter(request: UnloadLoraAdapterRequest, -# raw_request: Request): -# for route in [chat, completion, embedding]: -# handler = route(raw_request) -# if handler is not None: -# response = await handler.unload_lora_adapter(request) -# if isinstance(response, ErrorResponse): -# return JSONResponse(content=response.model_dump(), -# status_code=response.code) - -# return Response(status_code=200, content=response) - - # @app.exception_handler(RequestValidationError) - # async def validation_exception_handler(_, exc): - # err = ErrorResponse(message=str(exc), - # type="BadRequestError", - # code=HTTPStatus.BAD_REQUEST) - # return JSONResponse(err.model_dump(), - # status_code=HTTPStatus.BAD_REQUEST) - - # if token := envs.VLLM_API_KEY or args.api_key: - - # @app.middleware("http") - # async def authentication(request: Request, call_next): - # if request.method == "OPTIONS": - # return await call_next(request) - # url_path = request.url.path - # if app.root_path and url_path.startswith(app.root_path): - # url_path = url_path[len(app.root_path):] - # if not url_path.startswith("/v1"): - # return await call_next(request) - # if request.headers.get("Authorization") != "Bearer " + token: - # return JSONResponse(content={"error": "Unauthorized"}, - # status_code=401) - # return await call_next(request) - - # @app.middleware("http") - # async def add_request_id(request: Request, call_next): - # request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex - # response = await call_next(request) - # response.headers["X-Request-Id"] = request_id - # return response - - # print(f"{args.middleware=}") - # for middleware in args.middleware: - # module_path, object_name = middleware.rsplit(".", 1) - # imported = getattr(importlib.import_module(module_path), object_name) - # if inspect.isclass(imported): - # app.add_middleware(imported) - # elif inspect.iscoroutinefunction(imported): - # app.middleware("http")(imported) - # else: - # raise ValueError(f"Invalid middleware {middleware}. " - # f"Must be a function or a class.") + @router.post("/stop_profile") + async def stop_profile(raw_request: Request): + logger.info("Stopping profiler...") + await engine_client(raw_request).stop_profile() + logger.info("Profiler stopped.") + return Response(status_code=200) + + +if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: + logger.warning( + "Lora dynamic loading & unloading is enabled in the API server. 
" + "This should ONLY be used for local development!") + + @router.post("/v1/load_lora_adapter") + async def load_lora_adapter(request: LoadLoraAdapterRequest, + raw_request: Request): + for route in [chat, completion, embedding]: + handler = route(raw_request) + if handler is not None: + response = await handler.load_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) + + return Response(status_code=200, content=response) + + @router.post("/v1/unload_lora_adapter") + async def unload_lora_adapter(request: UnloadLoraAdapterRequest, + raw_request: Request): + for route in [chat, completion, embedding]: + handler = route(raw_request) + if handler is not None: + response = await handler.unload_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) + + return Response(status_code=200, content=response) + + +def build_app(args: Namespace) -> FastAPI: + if args.disable_fastapi_docs: + app = FastAPI(openapi_url=None, + docs_url=None, + redoc_url=None, + lifespan=lifespan) + else: + app = FastAPI(lifespan=lifespan) + app.include_router(router) + app.root_path = args.root_path + app.count = 0 + + mount_metrics(app) + + app.add_middleware( + CORSMiddleware, + allow_origins=args.allowed_origins, + allow_credentials=args.allow_credentials, + allow_methods=args.allowed_methods, + allow_headers=args.allowed_headers, + ) + + @app.exception_handler(RequestValidationError) + async def validation_exception_handler(_, exc): + err = ErrorResponse(message=str(exc), + type="BadRequestError", + code=HTTPStatus.BAD_REQUEST) + return JSONResponse(err.model_dump(), + status_code=HTTPStatus.BAD_REQUEST) + + if token := envs.VLLM_API_KEY or args.api_key: + + @app.middleware("http") + async def authentication(request: Request, call_next): + if request.method == "OPTIONS": + return await call_next(request) + url_path = request.url.path + if app.root_path and url_path.startswith(app.root_path): + url_path = url_path[len(app.root_path):] + if not url_path.startswith("/v1"): + return await call_next(request) + if request.headers.get("Authorization") != "Bearer " + token: + return JSONResponse(content={"error": "Unauthorized"}, + status_code=401) + return await call_next(request) + + @app.middleware("http") + async def add_request_id(request: Request, call_next): + request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex + response = await call_next(request) + response.headers["X-Request-Id"] = request_id + return response + + for middleware in args.middleware: + module_path, object_name = middleware.rsplit(".", 1) + imported = getattr(importlib.import_module(module_path), object_name) + if inspect.isclass(imported): + app.add_middleware(imported) + elif inspect.iscoroutinefunction(imported): + app.middleware("http")(imported) + else: + raise ValueError(f"Invalid middleware {middleware}. 
" + f"Must be a function or a class.") return app diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index a4f9f6b9af536..46c820fb5a794 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -6,7 +6,6 @@ from fastapi import Request -from pyinstrument import Profiler from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger @@ -73,7 +72,6 @@ async def create_completion( - suffix (the language models we currently support do not support suffix) """ - error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret @@ -273,9 +271,8 @@ async def completion_stream_generator( such that the API server falls behind, we dynamically fall back to streaming chunks of tokens. """ - print(f"GENERATOR: {should_profile=}") - should_profile=False if should_profile: + from pyinstrument import Profiler print("STARTING PROFILER") profiler = Profiler(async_mode="disabled") profiler.start() @@ -307,7 +304,7 @@ async def completion_stream_generator( if res.finished and should_profile: profiler.stop() - profiler.write_html("vllm-proxy.html") + profiler.write_html("task-disabled.html") # Prompt details are excluded from later streamed outputs if res.prompt_token_ids is not None: From 4571da6bb9ddaa7c536ba99d492e101524183ae6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 21:34:14 +0000 Subject: [PATCH 040/132] updated --- vllm/entrypoints/openai/api_server.py | 17 ++-- vllm/entrypoints/openai/protocol.py | 9 +- vllm/entrypoints/openai/serving_completion.py | 12 --- vllm/v1/engine/async_llm.py | 89 ++++--------------- vllm/v1/engine/core.py | 4 +- vllm/v1/engine/detokenizer.py | 6 +- 6 files changed, 33 insertions(+), 104 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f301ada394000..c59c075e0ae2e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -374,15 +374,12 @@ async def create_chat_completion(request: ChatCompletionRequest, @router.post("/v1/completions") async def create_completion(request: CompletionRequest, raw_request: Request): - raw_request.app.count += 1 - should_profile = raw_request.app.count == 500 handler = completion(raw_request) if handler is None: return base(raw_request).create_error_response( message="The model does not support Completions API") - generator = await handler.create_completion(request, raw_request, - should_profile=should_profile) + generator = await handler.create_completion(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -533,12 +530,12 @@ async def authentication(request: Request, call_next): status_code=401) return await call_next(request) - @app.middleware("http") - async def add_request_id(request: Request, call_next): - request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex - response = await call_next(request) - response.headers["X-Request-Id"] = request_id - return response + # @app.middleware("http") + # async def add_request_id(request: Request, call_next): + # request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex + # response = await call_next(request) + # response.headers["X-Request-Id"] = request_id + # return response for middleware in args.middleware: module_path, object_name = 
middleware.rsplit(".", 1) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index cfc02013dd8c5..8bae4dadbe625 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -38,11 +38,6 @@ assert _LONG_INFO.min == _MOCK_LONG_INFO.min assert _LONG_INFO.max == _MOCK_LONG_INFO.max -STREAM_SAMPLING_OUTPUT_KIND = RequestOutputKind.DELTA -if VLLM_USE_V1: - STREAM_SAMPLING_OUTPUT_KIND = RequestOutputKind.CUMULATIVE - - class OpenAIBaseModel(BaseModel): # OpenAI API does allow extra fields model_config = ConfigDict(extra="allow") @@ -427,7 +422,7 @@ def to_sampling_params( logits_processor_pattern), include_stop_str_in_output=self.include_stop_str_in_output, truncate_prompt_tokens=self.truncate_prompt_tokens, - output_kind=STREAM_SAMPLING_OUTPUT_KIND if self.stream \ + output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, guided_decoding=guided_decoding, logit_bias=self.logit_bias) @@ -742,7 +737,7 @@ def to_sampling_params( logits_processors=get_logits_processors(self.logits_processors, logits_processor_pattern), truncate_prompt_tokens=self.truncate_prompt_tokens, - output_kind=STREAM_SAMPLING_OUTPUT_KIND if self.stream \ + output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, guided_decoding=guided_decoding, logit_bias=self.logit_bias, diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 46c820fb5a794..19c6f59cdd2aa 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -61,7 +61,6 @@ async def create_completion( self, request: CompletionRequest, raw_request: Request, - should_profile: bool=False ) -> Union[AsyncGenerator[str, None], CompletionResponse, ErrorResponse]: """Completion API similar to OpenAI's API. @@ -189,7 +188,6 @@ async def create_completion( tokenizer=tokenizer, request_metadata=request_metadata, output_kind=sampling_params.output_kind, - should_profile=should_profile, ) # Non-streaming response @@ -249,7 +247,6 @@ async def completion_stream_generator( tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, output_kind: RequestOutputKind, - should_profile: bool = False, ) -> AsyncGenerator[str, None]: """ In V0, we use RequestOutputType.DELTA and each RequestOutput @@ -271,11 +268,6 @@ async def completion_stream_generator( such that the API server falls behind, we dynamically fall back to streaming chunks of tokens. 
""" - if should_profile: - from pyinstrument import Profiler - print("STARTING PROFILER") - profiler = Profiler(async_mode="disabled") - profiler.start() assert (output_kind == RequestOutputKind.CUMULATIVE or output_kind == RequestOutputKind.DELTA) @@ -302,10 +294,6 @@ async def completion_stream_generator( prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt - if res.finished and should_profile: - profiler.stop() - profiler.write_html("task-disabled.html") - # Prompt details are excluded from later streamed outputs if res.prompt_token_ids is not None: num_prompt_tokens[prompt_idx] = len(res.prompt_token_ids) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 09ee89e645a44..d53649fb0ffb2 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -28,29 +28,6 @@ import uvloop asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) -@dataclass -class RequestState: - """ - RequestState manages concurrency between: - * the output_handler(), which pulls outputs from EngineCore - * the per-request generate(), which yields to the API server - - The output_handler adds new RequestOutputs to out_list and sets the - asyncio event, notifying the generate() that there is work to do. - - generate() waits on the asyncio event and yields the data from - out_list back to the caller generate() - """ - - prompt: str - prompt_token_ids: List[int] - event: asyncio.Event - out_list: List[RequestOutput] - - @classmethod - def new(cls, prompt, prompt_token_ids) -> "RequestState": - return cls(prompt, prompt_token_ids, asyncio.Event(), []) - class AsyncLLM(EngineClient): @@ -82,8 +59,8 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() - # RequestId -> RequestState. - self.rid_to_state: Dict[str, RequestState] = {} + # RequestId -> OutputQueue. + self.rid_to_queue: Dict[str, asyncio.Queue[RequestOutput]] = {} # List of cancelled request ids to be aborted. self.client_aborted_requests: List[str] = [] @@ -186,7 +163,7 @@ async def add_request( trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> RequestState: + ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. @@ -196,8 +173,7 @@ async def add_request( # 1) Add to RequestState tracker. The "event" is used to manage # concurrency between generate() and output_handler(). - self.rid_to_state[request_id] = RequestState.new(prompt, - engine_core_req.prompt_token_ids) + self.rid_to_queue[request_id] = asyncio.Queue() # 3) Add the DetokenizerRequest to Detokenizer. # TODO: sending these separately is a race condition. We should instead @@ -207,7 +183,7 @@ async def add_request( # 4) Add the EngineCoreRequest to EngineCore. await self.engine_core.add_request_async(engine_core_req) - return self.rid_to_state[request_id] + return self.rid_to_queue[request_id] # TODO: we should support multiple prompts in one call, as you # can do with LLM.generate. So that for multi-prompt completion @@ -238,10 +214,6 @@ async def generate( the latest RequestOutput back to the caller. """ - # DELTA streaming is not supported due to dynamic chunking. 
- assert (sampling_params.output_kind == RequestOutputKind.CUMULATIVE or - sampling_params.output_kind == RequestOutputKind.FINAL_ONLY) - # We start the output_handler on the first call to generate() so that # we can call __init__ before the event loop starts, which enables us # to handle startup failure gracefully in the OpenAI server. @@ -259,7 +231,7 @@ def signal_handler(self, signum=None, frame=None): loop.create_task(self._run_output_handler()) loop.add_signal_handler(signal.SIGTERM, signal_handler) - state = await self.add_request( + queue = await self.add_request( request_id, prompt, sampling_params, @@ -271,33 +243,22 @@ def signal_handler(self, signum=None, frame=None): while True: try: - await asyncio.wait_for(state.event.wait(), timeout=4) - - # NOTE(rob): out_list can have more than one item. However, - # in the streaming case, we use RequestOutputKind.CUMULATIVE, - # which has the full generated text output (not just the text - # corresponding to the last token). So, we can just send the - # last RequestOutput and the API Client handles converting into - # a delta text. This way we do "dynamic chunked streaming", such - # that the API client does not fall behind the EngineCor, - # which happens at high QPS otherwise. - out = state.out_list[-1] - if len(state.out_list) > 1: - logger.info(f"{len(state.out_list)=}") + out = await asyncio.wait_for(queue.get(), timeout=4) + if out.finished: + del self.rid_to_queue[request_id] + yield out + break + + yield out except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. # logger.debug("Timeout waiting for %s", request_id) continue - - state.out_list = [] - if out.finished: - del self.rid_to_state[request_id] - yield out - break - state.event.clear() - yield out + + + # async def _process_cancellations(self) -> None: # """ @@ -331,30 +292,18 @@ def signal_handler(self, signum=None, frame=None): async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" - epoch = 0 while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 - # self.warned = False - # if self.epoch % 10 == 0: - # logger.info(f"\n{self.epoch=}\n") # 1) Pull outputs from the Detokenizer. - outputs = await self.detokenizer.output_socket.recv_pyobj() + outputs: List[RequestOutput] = await self.detokenizer.output_socket.recv_pyobj() for out in outputs: - if out.request_id not in self.rid_to_state: + if out.request_id not in self.rid_to_queue: raise RuntimeError(f"{out.request_id} " "not in RequestStates") - state = self.rid_to_state[out.request_id] - - # Update the RequestState and alert generate() that there - # is a RequestOutput ready to return to the user. - - state.out_list.append(out) - state.event.set() + self.rid_to_queue[out.request_id].put_nowait(out) # 3) Abort any requests that finished due to stop strings. # await self.engine_core.abort_requests_async(reqs_to_abort) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f8aef10908514..a3f294bffa064 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -247,8 +247,8 @@ def run_busy_loop(self): # Loop until process is sent a SIGINT or SIGTERM epoch = 0 while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 + # logger.info(f"EPOCH: {epoch}") + # epoch += 1 # 1) Poll the input queue until there is work to do. 
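
# For context on the generate()/output-handler changes above: a minimal,
# self-contained sketch of the per-request asyncio.Queue hand-off. The
# producer (the output handler) calls put_nowait(); the consumer awaits
# queue.get() with a timeout so it can periodically check for cancellation.
# `FakeOutput` is an illustrative stand-in for RequestOutput.
import asyncio
from dataclasses import dataclass


@dataclass
class FakeOutput:
    text: str
    finished: bool = False


async def consume(queue: "asyncio.Queue[FakeOutput]") -> list:
    texts = []
    while True:
        try:
            out = await asyncio.wait_for(queue.get(), timeout=4)
        except asyncio.TimeoutError:
            # Hook for client-cancellation checks, as in generate() above.
            continue
        texts.append(out.text)
        if out.finished:
            return texts


async def main() -> None:
    queue: "asyncio.Queue[FakeOutput]" = asyncio.Queue()
    queue.put_nowait(FakeOutput("Hello"))
    queue.put_nowait(FakeOutput(" world", finished=True))
    print(await consume(queue))  # ['Hello', ' world']


asyncio.run(main())
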
if not self.scheduler.has_unfinished_requests(): diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 8db857bca9b72..ac6ec1ce44deb 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -128,7 +128,7 @@ def from_eco( read_offset=read_offset, skip_special_tokens=True, spaces_between_special_tokens=True, - output_kind=RequestOutputKind.CUMULATIVE, + output_kind=RequestOutputKind.DELTA, request_id=eco.request_id, prompt=eco.prompt, prompt_token_ids=eco.prompt_token_ids, @@ -458,8 +458,8 @@ def run_busy_loop(self): epoch = 0 while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 + # logger.info(f"EPOCH: {epoch}") + # epoch += 1 socks = dict(poller.poll()) From c5dacd4a2b81f4e6e7e7de184d5bf4100bd642e3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 21:35:51 +0000 Subject: [PATCH 041/132] remove output kind from api server --- vllm/entrypoints/openai/serving_completion.py | 48 ++----------------- 1 file changed, 5 insertions(+), 43 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 19c6f59cdd2aa..e9a5a55079ca4 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -187,7 +187,6 @@ async def create_completion( num_prompts=num_prompts, tokenizer=tokenizer, request_metadata=request_metadata, - output_kind=sampling_params.output_kind, ) # Non-streaming response @@ -246,32 +245,7 @@ async def completion_stream_generator( num_prompts: int, tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, - output_kind: RequestOutputKind, ) -> AsyncGenerator[str, None]: - """ - In V0, we use RequestOutputType.DELTA and each RequestOutput - from the result_generator is guaranteed to correspond to - a single token so can construct the outputs without needing - to maintain any state. - - In V1, we use RequestOutputType.CUMULATIVE and each RequestOutput - from the result_generator is not guaranteed to correspond to - a single token (it could correspond to 2+ tokens). - - To handle this, we need to maintain state around how many - characters and tokens have been returned so far, and dynamically - stream back just the delta (where the delta could be the text - corresponding to N tokens). - - We do this to dynamically adjust how much work the API server - is doing. If the QPS is high and streaming becomes a bottleneck, - such that the API server falls behind, we dynamically fall back - to streaming chunks of tokens. 
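
# For context on the DELTA vs CUMULATIVE streaming discussion in the
# surrounding hunks: a minimal sketch of the two bookkeeping modes. With
# CUMULATIVE outputs each chunk carries the full text generated so far and the
# server slices off what was already sent; with DELTA outputs each chunk is
# already the increment and only lengths are accumulated. Illustrative only,
# not the vLLM implementation.
from typing import List


def deltas_from_cumulative(chunks: List[str]) -> List[str]:
    deltas, sent_len = [], 0
    for text in chunks:
        deltas.append(text[sent_len:])
        sent_len = len(text)
    return deltas


def chars_sent_from_deltas(chunks: List[str]) -> int:
    return sum(len(text) for text in chunks)


assert deltas_from_cumulative(["He", "Hello", "Hello!"]) == ["He", "llo", "!"]
assert chars_sent_from_deltas(["He", "llo", "!"]) == 6
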
- """ - - assert (output_kind == RequestOutputKind.CUMULATIVE - or output_kind == RequestOutputKind.DELTA) - num_choices = 1 if request.n is None else request.n previous_text_lens = [0] * num_choices * num_prompts previous_num_tokens = [0] * num_choices * num_prompts @@ -327,17 +301,9 @@ async def completion_stream_generator( ] has_echoed[i] = True else: - if output_kind == RequestOutputKind.CUMULATIVE: - delta_text = output.text[previous_text_lens[i]:] - delta_token_ids = output.token_ids[ - previous_num_tokens[i]:] - out_logprobs = ( - output.logprobs[previous_num_tokens[i]:] - if output.logprobs else None) - else: - delta_text = output.text - delta_token_ids = output.token_ids - out_logprobs = output.logprobs + delta_text = output.text + delta_token_ids = output.token_ids + out_logprobs = output.logprobs if not delta_text and not delta_token_ids \ and not previous_num_tokens[i]: @@ -357,12 +323,8 @@ async def completion_stream_generator( else: logprobs = None - if output_kind == RequestOutputKind.CUMULATIVE: - previous_text_lens[i] = len(output.text) - previous_num_tokens[i] = len(output.token_ids) - else: - previous_text_lens[i] += len(output.text) - previous_num_tokens[i] += len(output.token_ids) + previous_text_lens[i] += len(output.text) + previous_num_tokens[i] += len(output.token_ids) finish_reason = output.finish_reason stop_reason = output.stop_reason From 23d3e60ceacd0c06e41d4781a6424bd8290a780e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 21:58:16 +0000 Subject: [PATCH 042/132] updated --- benchmarks/backend_request_func.py | 4 +--- benchmarks/benchmark_throughput.py | 19 ++++++++----------- vllm/v1/engine/async_llm.py | 6 +++++- vllm/v1/engine/core.py | 4 ++-- vllm/v1/engine/detokenizer.py | 5 ++--- 5 files changed, 18 insertions(+), 20 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 1374768dc3def..b67849038cf0d 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -242,9 +242,7 @@ async def async_request_openai_completions( "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, - # "ignore_eos": request_func_input.ignore_eos, - "ignore_eos": False, - + "ignore_eos": request_func_input.ignore_eos, } if request_func_input.extra_body: payload.update(request_func_input.extra_body) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 12ee9798d827e..69e82099e4506 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -198,27 +198,21 @@ async def run_vllm_async( max_tokens=request.expected_output_len, )) + async def run(generator): + async for res in generator: + pass + tasks = [] start = time.perf_counter() for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): generator = llm.generate(prompt, sp, request_id=f"test{i}") tasks.append(run(generator)) - # all_gens = merge_async_iterators(*generators) - # async for i, res in all_gens: - # pass - from aiodebug import log_slow_callbacks - loop = asyncio.get_event_loop() - log_slow_callbacks.enable(0.05) await asyncio.gather(*tasks) end = time.perf_counter() return end - start -async def run(generator): - async for res in generator: - pass - def run_hf( requests: List[SampleRequest], model: str, @@ -371,7 +365,10 @@ def main(args: argparse.Namespace): # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. 
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " - f"{total_output_tokens / elapsed_time:.2f} output tokens/s") + f"{total_output_tokens / elapsed_time:.2f} output tokens/s, " + f"{(total_num_tokens - total_output_tokens) / len(requests)} input tokens/req, " + f"{(total_output_tokens) / len(requests)} output tokens/req, " + ) # Output JSON results if specified if args.output_json: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d53649fb0ffb2..f998aab95e4dd 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -92,7 +92,6 @@ def __init__( # self.output_handler: Optional[asyncio.Task] = None self.to_create_loop = True - self.epoch = 0 def __del__(self): self.shutdown() @@ -244,6 +243,8 @@ def signal_handler(self, signum=None, frame=None): while True: try: out = await asyncio.wait_for(queue.get(), timeout=4) + + logger.info(f"{queue.qsize()=}") if out.finished: del self.rid_to_queue[request_id] yield out @@ -293,7 +294,10 @@ def signal_handler(self, signum=None, frame=None): async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" + epoch = 0 while True: + logger.info(f"EPOCH: {epoch}") + epoch+=1 # 1) Pull outputs from the Detokenizer. outputs: List[RequestOutput] = await self.detokenizer.output_socket.recv_pyobj() diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index a3f294bffa064..f8aef10908514 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -247,8 +247,8 @@ def run_busy_loop(self): # Loop until process is sent a SIGINT or SIGTERM epoch = 0 while True: - # logger.info(f"EPOCH: {epoch}") - # epoch += 1 + logger.info(f"EPOCH: {epoch}") + epoch += 1 # 1) Poll the input queue until there is work to do. 
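
# For context on the throughput summary printed above: the same per-request
# accounting as a small helper. Purely illustrative; the argument names are
# assumptions, not benchmark_throughput.py's API.
def summarize(elapsed_s: float, num_requests: int,
              total_tokens: int, output_tokens: int) -> dict:
    input_tokens = total_tokens - output_tokens
    return {
        "requests_per_s": num_requests / elapsed_s,
        "total_tokens_per_s": total_tokens / elapsed_s,
        "output_tokens_per_s": output_tokens / elapsed_s,
        "input_tokens_per_req": input_tokens / num_requests,
        "output_tokens_per_req": output_tokens / num_requests,
    }


print(summarize(elapsed_s=10.0, num_requests=100,
                total_tokens=50_000, output_tokens=20_000))
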
if not self.scheduler.has_unfinished_requests(): diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index ac6ec1ce44deb..92fac83c4455a 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -231,7 +231,6 @@ def _get_next_output_text(self, finished: bool, delta: bool) -> str: return self.output_text[last_offset:length] return "" -import time class Detokenizer: def __init__(self, @@ -458,8 +457,8 @@ def run_busy_loop(self): epoch = 0 while True: - # logger.info(f"EPOCH: {epoch}") - # epoch += 1 + logger.info(f"EPOCH: {epoch}") + epoch += 1 socks = dict(poller.poll()) From 3acf5c261f9b2682f6f6dbcd7ffd340a02e4a4ac Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 21:59:39 +0000 Subject: [PATCH 043/132] cleanup --- benchmarks/benchmark_throughput.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 69e82099e4506..0695a8579c49c 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -6,6 +6,7 @@ import time from typing import List, Optional +import asyncio import torch import uvloop from PIL import Image @@ -170,7 +171,6 @@ def run_vllm( end = time.perf_counter() return end - start -import asyncio async def run_vllm_async( requests: List[SampleRequest], n: int, @@ -334,8 +334,7 @@ def main(args: argparse.Namespace): for request in requests) if args.backend == "vllm": if args.async_engine: - asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) - elapsed_time = asyncio.run( + elapsed_time = uvloop.run( run_vllm_async( requests, args.n, From 84ff3c2e1cfe5d8709bf50bfd231c2b339fa89c2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:00:55 +0000 Subject: [PATCH 044/132] cleanup --- benchmarks/benchmark_throughput.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 0695a8579c49c..dac6dcc959a94 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -171,6 +171,7 @@ def run_vllm( end = time.perf_counter() return end - start + async def run_vllm_async( requests: List[SampleRequest], n: int, @@ -198,21 +199,19 @@ async def run_vllm_async( max_tokens=request.expected_output_len, )) - async def run(generator): - async for res in generator: - pass - - tasks = [] + generators = [] start = time.perf_counter() for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): generator = llm.generate(prompt, sp, request_id=f"test{i}") - tasks.append(run(generator)) - - await asyncio.gather(*tasks) - + generators.append(generator) + + all_gens = merge_async_iterators(*generators) + async for i, res in all_gens: + pass end = time.perf_counter() return end - start + def run_hf( requests: List[SampleRequest], model: str, From ddf14264b15c67784aabbfa9c7bf723bbc5ea098 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:02:02 +0000 Subject: [PATCH 045/132] updated --- benchmarks/benchmark_throughput.py | 2 -- vllm/entrypoints/openai/api_server.py | 1 - vllm/entrypoints/openai/protocol.py | 1 - 3 files changed, 4 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index dac6dcc959a94..16ca509c12d18 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -6,7 +6,6 @@ import time from typing import List, Optional -import 
asyncio import torch import uvloop from PIL import Image @@ -204,7 +203,6 @@ async def run_vllm_async( for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): generator = llm.generate(prompt, sp, request_id=f"test{i}") generators.append(generator) - all_gens = merge_async_iterators(*generators) async for i, res in all_gens: pass diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index c59c075e0ae2e..090c610d3a008 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -494,7 +494,6 @@ def build_app(args: Namespace) -> FastAPI: app = FastAPI(lifespan=lifespan) app.include_router(router) app.root_path = args.root_path - app.count = 0 mount_metrics(app) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 8bae4dadbe625..ff7bbb8da80cc 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -10,7 +10,6 @@ from typing_extensions import Annotated from vllm.entrypoints.chat_utils import ChatCompletionMessageParam -from vllm.envs import VLLM_USE_V1 from vllm.logger import init_logger from vllm.pooling_params import PoolingParams from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, From 895fd0d38cf446365b17455b3e06449c2462587c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:02:50 +0000 Subject: [PATCH 046/132] updated --- vllm/v1/engine/async_llm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index f998aab95e4dd..10a19007be117 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -244,7 +244,9 @@ def signal_handler(self, signum=None, frame=None): try: out = await asyncio.wait_for(queue.get(), timeout=4) - logger.info(f"{queue.qsize()=}") + q_size = queue.qsize() + if len(q_size) > 0: + logger.info(f"{q_size}") if out.finished: del self.rid_to_queue[request_id] yield out From 1184615cc632a20cff31b5d431ec2c756074627b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:03:26 +0000 Subject: [PATCH 047/132] cleanup --- vllm/entrypoints/openai/protocol.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index ff7bbb8da80cc..6ed7c2e9dcd6b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -37,6 +37,7 @@ assert _LONG_INFO.min == _MOCK_LONG_INFO.min assert _LONG_INFO.max == _MOCK_LONG_INFO.max + class OpenAIBaseModel(BaseModel): # OpenAI API does allow extra fields model_config = ConfigDict(extra="allow") From 7da9b1ad83ee9a8e34b0ef9f9b50aa72eccfaf73 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:04:01 +0000 Subject: [PATCH 048/132] cleanup --- vllm/entrypoints/openai/serving_completion.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index e9a5a55079ca4..607c30b55ea45 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -27,8 +27,7 @@ PromptAdapterPath) from vllm.logger import init_logger from vllm.outputs import RequestOutput -from vllm.sampling_params import (BeamSearchParams, RequestOutputKind, - SamplingParams) +from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob from 
vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import merge_async_iterators @@ -174,7 +173,6 @@ async def create_completion( stream = (request.stream and (request.best_of is None or request.n == request.best_of) and not request.use_beam_search) - assert isinstance(sampling_params, SamplingParams) # Streaming response if stream: @@ -186,8 +184,7 @@ async def create_completion( model_name, num_prompts=num_prompts, tokenizer=tokenizer, - request_metadata=request_metadata, - ) + request_metadata=request_metadata) # Non-streaming response final_res_batch: List[Optional[RequestOutput]] = [None] * num_prompts From f61c26a167c4021325d3400eddfbb6804f97beef Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:04:43 +0000 Subject: [PATCH 049/132] cleanup --- vllm/entrypoints/openai/serving_completion.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 607c30b55ea45..72b98dab345a3 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -319,7 +319,6 @@ async def completion_stream_generator( ) else: logprobs = None - previous_text_lens[i] += len(output.text) previous_num_tokens[i] += len(output.token_ids) @@ -379,7 +378,6 @@ async def completion_stream_generator( yield f"data: {data}\n\n" yield "data: [DONE]\n\n" - def request_output_to_completion_response( self, final_res_batch: List[RequestOutput], From 4e3de90c40b0f1799db4333dd911ed2138984e31 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:05:57 +0000 Subject: [PATCH 050/132] updated --- vllm/v1/engine/__init__.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index f81869a46b837..1482505e88490 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -35,31 +35,6 @@ class DetokenizerRequest( include_stop_str_in_output: bool -class DetokenizerOutput( - msgspec.Struct, - array_like=True, # type: ignore[call-arg] - omit_defaults=True, # type: ignore[call-arg] - gc=False): # type: ignore[call-arg] - - request_id: str - text: str - token_ids: List[int] - finished: bool - - -class DetokenizerOutputs( - msgspec.Struct, - array_like=True, # type: ignore[call-arg] - omit_defaults=True, # type: ignore[call-arg] - gc=False): # type: ignore[call-arg] - - #NOTE(Nick): We could consider ways to make this more compact, - # e.g. 
columnwise layout and using an int enum for finish/stop reason - - # [num_reqs] - outputs: List[DetokenizerOutput] - - @dataclass class EngineCoreRequest: From 07e4fa2d2b6d05a32f2257bc8c2b22ee946dfdf7 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:06:52 +0000 Subject: [PATCH 051/132] updated --- vllm/v1/engine/core.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f8aef10908514..ebfa9b14d90e7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -338,13 +338,8 @@ def process_output_socket(self, output_path: str): buffer = bytearray() with zmq_socket_ctx(output_path, zmq.PUSH) as socket: - idx = 0 while True: engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) encoder.encode_into(outputs, buffer) - # msg = (DetokenizerRequestType.OUT.value, buffer) - msg = (buffer, ) - # logger.info(f"SEND: {idx}: {len(engine_core_outputs)}") - # idx += 1 - socket.send_multipart(msg, copy=False) + socket.send_multipart((buffer,), copy=False) From 2022a4f4f8a55485dc73cb3fd50daac41eb3cc64 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:09:46 +0000 Subject: [PATCH 052/132] format --- vllm/outputs.py | 24 ------------------------ vllm/v1/engine/__init__.py | 1 - vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/detokenizer.py | 6 ++---- 4 files changed, 3 insertions(+), 30 deletions(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index 9a4b4353deb1d..b2f869b862a6a 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -12,7 +12,6 @@ from vllm.sampling_params import RequestOutputKind from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs, SequenceGroup, SequenceGroupBase, SequenceStatus) -from vllm.v1.engine import DetokenizerOutput @dataclass @@ -133,29 +132,6 @@ def __init__( self.encoder_prompt_token_ids = encoder_prompt_token_ids self.num_cached_tokens = num_cached_tokens - @classmethod - def from_detok( - cls, - prompt: str, - prompt_token_ids: List[int], - detok_output: DetokenizerOutput, - ): - completion_output = CompletionOutput( - index=0, - text=detok_output.text, - token_ids=detok_output.token_ids, - cumulative_logprob=None, - logprobs=None, # TODO - ) - - return RequestOutput( - request_id=detok_output.request_id, - prompt=prompt, - prompt_token_ids=prompt_token_ids, - prompt_logprobs=None, # TODO - outputs=[completion_output], - finished=detok_output.finished, - ) @classmethod def new( diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 1482505e88490..367aee130cd75 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -69,7 +69,6 @@ class EngineCoreOutput( finished: bool finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None - class EngineCoreOutputs( diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 10a19007be117..003db30adbc12 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -245,7 +245,7 @@ def signal_handler(self, signum=None, frame=None): out = await asyncio.wait_for(queue.get(), timeout=4) q_size = queue.qsize() - if len(q_size) > 0: + if q_size > 0: logger.info(f"{q_size}") if out.finished: del self.rid_to_queue[request_id] diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 92fac83c4455a..db65d30f443b1 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -14,10 +14,8 @@ AnyTokenizer, 
convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import get_open_zmq_ipc_path, kill_process_tree -from vllm.v1.engine import (DetokenizerRequest, DetokenizerOutputs, - DetokenizerOutput, - EngineCoreOutput, EngineCoreOutputs, - BackgroundProcHandle,) +from vllm.v1.engine import (DetokenizerRequest, EngineCoreOutput, + EngineCoreOutputs, BackgroundProcHandle,) from vllm.v1.utils import (make_zmq_socket, zmq_socket_ctx, wait_for_startup) from vllm.v1.serial_utils import PickleEncoder From 10c7092c637cac12e87ef1505227545e5d7a5c7c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:10:10 +0000 Subject: [PATCH 053/132] updated --- vllm/v1/engine/detokenizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index db65d30f443b1..89757501fe919 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -287,12 +287,11 @@ def add_request_eco( def step( self, encore_core_outputs: List[EngineCoreOutput] - ) -> DetokenizerOutputs: + ) -> List[RequestOutput]: """Update state and request the RequestOutputs to the LLMEngine.""" request_outputs: List[RequestOutput] = [] # requests_to_abort: List[str] = [] - # detokenizer_outputs = DetokenizerOutputs(outputs=[]) for engine_core_output in encore_core_outputs: request_id = engine_core_output.request_id From a0620ac457e8157d70c8de11834d4f465ea3464d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:10:39 +0000 Subject: [PATCH 054/132] cleanup --- vllm/v1/engine/detokenizer.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 89757501fe919..1a19d99b7d6bc 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -485,8 +485,6 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): # Serialization setup. self.encoder = msgspec.msgpack.Encoder() - # self.decoder = PickleEncoder() - self.decoder = msgspec.msgpack.Decoder(DetokenizerOutputs) # ZMQ setup. 
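The serialization cleanup in these detokenizer hunks keeps the msgspec/msgpack path on the client side. For reference, a self-contained round trip of that pattern; the `Output` struct below is illustrative and not the real `EngineCoreOutput`:

    # Typed msgspec Structs encoded to msgpack bytes on one side of a socket
    # and decoded back into the same type on the other side.
    from typing import List, Optional

    import msgspec

    class Output(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
        request_id: str
        new_token_ids: List[int]
        finished: bool
        finish_reason: Optional[str] = None

    encoder = msgspec.msgpack.Encoder()
    decoder = msgspec.msgpack.Decoder(Output)

    payload = encoder.encode(Output("req-0", [1, 2, 3], False))
    assert decoder.decode(payload) == Output("req-0", [1, 2, 3], False)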
self.ctx = zmq.asyncio.Context(2) @@ -533,7 +531,4 @@ async def add_request_async(self, request: DetokenizerRequest): async def get_output_async(self) -> List[RequestOutput]: """Get RequestOutputs, RequestsToAbort from Detokenizer.""" - # (frame, ) = await self.output_socket.recv_multipart(copy=False) - # return self.decoder.decode(frame.buffer) return await self.output_socket.recv_pyobj() - # return self.decoder.decode(msg) From 12b3e066d1cce642cac958a8903051d69d8c4233 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:15:11 +0000 Subject: [PATCH 055/132] more cleanup --- vllm/entrypoints/openai/serving_completion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 72b98dab345a3..11278c8421c3c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -319,9 +319,9 @@ async def completion_stream_generator( ) else: logprobs = None + previous_text_lens[i] += len(output.text) previous_num_tokens[i] += len(output.token_ids) - finish_reason = output.finish_reason stop_reason = output.stop_reason @@ -431,6 +431,7 @@ def request_output_to_completion_response( output_text = prompt_text + output.text else: + # return just the delta token_ids = output.token_ids out_logprobs = output.logprobs output_text = output.text From e0926641f6ebda168efb2af64d58e1e80a7d968d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:15:40 +0000 Subject: [PATCH 056/132] more cleanup --- vllm/entrypoints/openai/serving_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 11278c8421c3c..d87c410c0124c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -298,6 +298,7 @@ async def completion_stream_generator( ] has_echoed[i] = True else: + # return just the delta delta_text = output.text delta_token_ids = output.token_ids out_logprobs = output.logprobs @@ -431,7 +432,6 @@ def request_output_to_completion_response( output_text = prompt_text + output.text else: - # return just the delta token_ids = output.token_ids out_logprobs = output.logprobs output_text = output.text From 32df238cdb0dddd06d3c9067ac8c48b119c2f97a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:16:15 +0000 Subject: [PATCH 057/132] updated --- vllm/outputs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index b2f869b862a6a..2ecdf74ee59b3 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -132,7 +132,6 @@ def __init__( self.encoder_prompt_token_ids = encoder_prompt_token_ids self.num_cached_tokens = num_cached_tokens - @classmethod def new( cls, From 7dff86359632c98f30f0d0d5a5cca6d20f9b03cf Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:16:52 +0000 Subject: [PATCH 058/132] more cleanup --- vllm/v1/engine/__init__.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 367aee130cd75..c95bc22a9aaab 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -89,15 +89,6 @@ class EngineCoreProfile: is_start: bool -class DetokenizerRequestType(enum.Enum): - """ - Request types defined as hex byte strings, so it can be sent over sockets - without separate 
encoding step. - """ - NEW = b'\x00' - OUT = b'\x01' - - class EngineCoreRequestType(enum.Enum): """ Request types defined as hex byte strings, so it can be sent over sockets From 103729d5b186cd1307d226eaf21622f6bf94b150 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:41:21 +0000 Subject: [PATCH 059/132] updated --- vllm/v1/engine/__init__.py | 27 ++---------------- vllm/v1/engine/async_llm.py | 32 ++++++++------------- vllm/v1/engine/core.py | 5 ++-- vllm/v1/engine/detokenizer.py | 54 +++++++++++++++-------------------- vllm/v1/engine/processor.py | 20 +++---------- 5 files changed, 43 insertions(+), 95 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index c95bc22a9aaab..df0001679e555 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -18,33 +18,10 @@ class BackgroundProcHandle: output_path: str -class DetokenizerRequest( - msgspec.Struct, - array_like=True, # type: ignore[call-arg] - omit_defaults=True, # type: ignore[call-arg] - gc=False): # type: ignore[call-arg] - - request_id: str - prompt: Optional[str] - prompt_token_ids: List[int] - skip_special_tokens: bool - spaces_between_special_tokens: bool - output_kind: RequestOutputKind - - stop: List[str] - include_stop_str_in_output: bool - - @dataclass -class EngineCoreRequest: - - # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput, - # but this object is currently not playing well with msgspec - # due to circular imports and typing we have in data.py +class EngineRequest: request_id: str - #NOTE(Nick): I don't think we need to pass prompt here since it should - # always be tokenized? prompt: Optional[str] prompt_token_ids: List[int] mm_inputs: Optional[List[Optional[MultiModalKwargs]]] @@ -99,4 +76,4 @@ class EngineCoreRequestType(enum.Enum): PROFILE = b'\x02' -EngineCoreRequestUnion = Union[EngineCoreRequest, EngineCoreProfile, List[str]] +EngineCoreRequestUnion = Union[EngineRequest, EngineCoreProfile, List[str]] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 003db30adbc12..8dcf8de1a092e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -13,7 +13,7 @@ from vllm.outputs import RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext @@ -165,22 +165,16 @@ async def add_request( ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" - # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. - _, engine_core_req = self.processor.process_inputs( + # 1) Convert Input --> EngineRequest. + engine_request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 1) Add to RequestState tracker. The "event" is used to manage - # concurrency between generate() and output_handler(). + # 2) Create Queue (output_handler pushes, generate pulls) self.rid_to_queue[request_id] = asyncio.Queue() - # 3) Add the DetokenizerRequest to Detokenizer. - # TODO: sending these separately is a race condition. We should instead - # have the EngineCore do the "AddRequest" logic. 
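The TODO being deleted above flagged the race of sending a new request to the Detokenizer and the EngineCore separately; this commit's fix is to hand the request only to the Detokenizer, which registers its own state and then forwards the request to the EngineCore (see the busy-loop hunk later in this patch). A minimal sketch of that register-then-forward step, with an illustrative state dict standing in for `IncrementalDetokenizer`:

    # Register detokenizer-side state *before* forwarding the request, so
    # engine outputs can never arrive for an unknown request id.
    import zmq

    def handle_new_request(request, request_states: dict,
                           to_engine_core: zmq.Socket) -> None:
        assert request.request_id not in request_states
        # 1) Record per-request state first (stand-in for the real
        #    IncrementalDetokenizer state).
        request_states[request.request_id] = {
            "prompt_token_ids": list(request.prompt_token_ids),
            "output_text": "",
        }
        # 2) Only then forward the request to the engine process.
        to_engine_core.send_pyobj(request)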
- # await self.detokenizer.add_request_async(detokenizer_req) - - # 4) Add the EngineCoreRequest to EngineCore. - await self.engine_core.add_request_async(engine_core_req) + # 3) Send to Detokenizer. + await self.detokenizer.add_request_async(engine_request) return self.rid_to_queue[request_id] @@ -201,16 +195,15 @@ async def generate( ) -> AsyncGenerator[RequestOutput, None]: """ Main function called by the API server to kick off a request - * 1) Make RequestState corresponding to the Request. + * 1) Make a queue corresponding to the Request. # 2) Processing the Input. - * 3) Adding the Request to the Detokenizer. - * 4) Adding the Request to the EngineCore (separate process). + * 3) Adding the Request to the Detokenize + EngineCore. - The output_handler() loop runs in a background task, pulling from - EngineCore and updating the RequestState and setting the asyncio event. + The output_handler() loop runs in a background task, pulling + from Detokenizer and pushing to the per request queue. - The caller of generate() waits on the asyncio event and forwards - the latest RequestOutput back to the caller. + The generate() pulls from the per requests queue and yeilds + to the caller which iterates the AsyncGenerator. """ # We start the output_handler on the first call to generate() so that @@ -218,7 +211,6 @@ async def generate( # to handle startup failure gracefully in the OpenAI server. # if self.output_handler is None: if self.to_create_loop: - import signal def signal_handler(self, signum=None, frame=None): logger.warning( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index ebfa9b14d90e7..e35e5ac464305 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -18,9 +18,8 @@ from vllm.utils import get_open_zmq_ipc_path from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, + EngineCoreProfile, EngineRequest, EngineCoreRequestType, EngineCoreRequestUnion, - DetokenizerRequestType, BackgroundProcHandle) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor @@ -89,7 +88,7 @@ def _initialize_kv_caches(self, "warmup model) took %.2f seconds"), elapsed) return num_gpu_blocks, num_cpu_blocks - def add_request(self, request: EngineCoreRequest): + def add_request(self, request: EngineRequest): """Add request to the scheduler.""" if request.mm_hashes is not None: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 1a19d99b7d6bc..c265cc984dc5e 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -14,11 +14,10 @@ AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import get_open_zmq_ipc_path, kill_process_tree -from vllm.v1.engine import (DetokenizerRequest, EngineCoreOutput, - EngineCoreOutputs, BackgroundProcHandle,) +from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, + BackgroundProcHandle, EngineRequest) from vllm.v1.utils import (make_zmq_socket, zmq_socket_ctx, wait_for_startup) -from vllm.v1.serial_utils import PickleEncoder logger = init_logger(__name__) @@ -66,19 +65,20 @@ def output_token_ids(self) -> List[int]: def from_new_request( cls, tokenizer: AnyTokenizer, - request: DetokenizerRequest, + request: EngineRequest, ) -> "IncrementalDetokenizer": + sampling_params = request.sampling_params tokens, prefix_offset, read_offset = 
convert_prompt_ids_to_tokens( tokenizer=tokenizer, prompt_ids=request.prompt_token_ids, - skip_special_tokens=request.skip_special_tokens, + skip_special_tokens=sampling_params.skip_special_tokens, ) - stops = request.stop + stops = request.sampling_params # Number of chars to hold back when stop strings are to be excluded # from streamed output. - if stops and not request.include_stop_str_in_output: + if stops and not sampling_params.include_stop_str_in_output: stop_buffer_length = max(len(s) for s in stops) - 1 else: stop_buffer_length = 0 @@ -90,13 +90,13 @@ def from_new_request( # NOTE(Nick): could we take ownership of it though? token_ids=request.prompt_token_ids.copy(), stop=stops, - include_stop_str_in_output=request.include_stop_str_in_output, + include_stop_str_in_output=sampling_params.include_stop_str_in_output, prefix_offset=prefix_offset, read_offset=read_offset, - skip_special_tokens=request.skip_special_tokens, - spaces_between_special_tokens=request. + skip_special_tokens=sampling_params.skip_special_tokens, + spaces_between_special_tokens=sampling_params. spaces_between_special_tokens, - output_kind=request.output_kind, + output_kind=sampling_params.output_kind, request_id=request.request_id, prompt=request.prompt, prompt_token_ids=request.prompt_token_ids, @@ -266,7 +266,7 @@ def abort_requests( def add_request( self, - request: DetokenizerRequest, + request: EngineRequest, ): """Add new request to the Detokenizer.""" @@ -428,29 +428,24 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the Detokenizer.""" - log_interval = 0 - import time - - last_log = time.perf_counter() try: # TODO: handle aborted due to client cancellation # TODO: pickle -> msgpack # TODO: send stop string aborts back to EngineCore directly - decoder_new = msgspec.msgpack.Decoder(DetokenizerRequest) + decoder_new = msgspec.msgpack.Decoder(EngineRequest) decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) - encoder = msgspec.msgpack.Encoder() - with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as engine_core_outputs_socket, - zmq_socket_ctx(self.input_path, zmq.PULL) as input_socket, + with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as from_engine_core, + zmq_socket_ctx(self.input_path, zmq.PULL) as from_llm_engine, zmq_socket_ctx(self.output_path, zmq.PUSH) as output_socket): # TODO: avoid poll by having both EngineCore # and AsyncLLM send to the same socket (unclear why this # was not working when I originally tried it) poller = zmq.Poller() - poller.register(engine_core_outputs_socket, zmq.POLLIN) - poller.register(input_socket, zmq.POLLIN) + poller.register(from_engine_core, zmq.POLLIN) + poller.register(from_llm_engine, zmq.POLLIN) epoch = 0 while True: @@ -460,19 +455,16 @@ def run_busy_loop(self): socks = dict(poller.poll()) # Handle NewRequest. - if input_socket in socks: - (frame, ) = input_socket.recv_multipart(copy=False) - detokenizer_request = decoder_new.decode(frame.buffer) - self.add_request(detokenizer_request) + if from_llm_engine in socks: + (frame, ) = from_llm_engine.recv_multipart(copy=False) + engine_request = decoder_new.decode(frame.buffer) + self.add_request(engine_request) # Handle EngineCoreOutput. 
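For readers skimming the diff, the two-input busy loop above reduces to the standard pyzmq poller pattern: block on both PULL sockets and dispatch on whichever is readable. A standalone sketch with placeholder handlers (socket names mirror the hunk, the handler bodies do not):

    import pickle

    import zmq

    def busy_loop(from_llm_engine: zmq.Socket,
                  from_engine_core: zmq.Socket) -> None:
        poller = zmq.Poller()
        poller.register(from_llm_engine, zmq.POLLIN)
        poller.register(from_engine_core, zmq.POLLIN)
        while True:
            socks = dict(poller.poll())
            if from_llm_engine in socks:
                # New request from the frontend process.
                request = pickle.loads(from_llm_engine.recv())
                print("new request:", request)
            if from_engine_core in socks:
                # Batch of engine outputs.
                (frame, ) = from_engine_core.recv_multipart(copy=False)
                print("engine output bytes:", len(frame.buffer))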
- if engine_core_outputs_socket in socks: - (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) + if from_engine_core in socks: + (frame, ) = from_engine_core.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs request_outputs, _ = self.step(engine_core_outputs) - # msg = encoder.encode(detokenizer_outputs) - # # output_socket.send_multipart((msg, ), copy=False) - # output_socket.send(msg) output_socket.send_pyobj(request_outputs) except Exception as e: diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 679bf8e25e9ca..d37eab3418c3e 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -14,7 +14,7 @@ from vllm.sampling_params import SamplingParams from vllm.transformers_utils.config import try_get_generation_config from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup -from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest +from vllm.v1.engine import DetokenizerData, EngineRequest from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient @@ -61,7 +61,7 @@ def process_inputs( trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: + ) -> EngineRequest: # TODO(woosuk): Support pooling models. # TODO(woosuk): Check max_logprobs @@ -122,20 +122,8 @@ def process_inputs( decoder_inputs.multi_modal_data, mm_hashes, decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs) - # Make Request for Detokenizer. - detokenizer_request = DetokenizerRequest( - request_id, - decoder_inputs.prompt, - decoder_inputs.prompt_token_ids, - sampling_params.skip_special_tokens, - sampling_params.spaces_between_special_tokens, - sampling_params.output_kind, - sampling_params.stop, - sampling_params.include_stop_str_in_output, - ) - # Make Request for EngineCore. 
- engine_core_request = EngineCoreRequest( + engine_request = EngineRequest( request_id, decoder_inputs.prompt, decoder_inputs.prompt_token_ids, @@ -148,7 +136,7 @@ def process_inputs( lora_request, ) - return detokenizer_request, engine_core_request + return engine_request def _validate_model_inputs(self, inputs: ProcessorInputs): if is_encoder_decoder_inputs(inputs): From 380086c0b565403a5489bf489ffa560901384254 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:47:01 +0000 Subject: [PATCH 060/132] updated --- vllm/v1/engine/detokenizer.py | 47 ++--------------------------------- 1 file changed, 2 insertions(+), 45 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index c265cc984dc5e..91b5f7a9c1800 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -104,37 +104,6 @@ def from_new_request( stop_buffer_length=stop_buffer_length, ) - @classmethod - def from_eco( - cls, - tokenizer: AnyTokenizer, - eco: EngineCoreOutput, - ): - tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( - tokenizer=tokenizer, - prompt_ids=eco.prompt_token_ids, - skip_special_tokens=True, - ) - - return cls( - output_text="", - tokens=tokens, - token_ids=eco.prompt_token_ids, - stop=[], - include_stop_str_in_output=False, - prefix_offset=prefix_offset, - read_offset=read_offset, - skip_special_tokens=True, - spaces_between_special_tokens=True, - output_kind=RequestOutputKind.DELTA, - request_id=eco.request_id, - prompt=eco.prompt, - prompt_token_ids=eco.prompt_token_ids, - tokenizer=tokenizer, - stop_buffer_length=0, - ) - - def add_tokens( self, new_token_ids: List[int], @@ -275,14 +244,6 @@ def add_request( request_state = IncrementalDetokenizer.from_new_request( self.tokenizer, request) self.request_states[request.request_id] = request_state - - def add_request_eco( - self, - eco: EngineCoreOutput, - ): - request_state = IncrementalDetokenizer.from_eco( - self.tokenizer, eco) - self.request_states[eco.request_id] = request_state def step( @@ -296,9 +257,6 @@ def step( for engine_core_output in encore_core_outputs: request_id = engine_core_output.request_id - if request_id not in self.request_states: - self.add_request_eco(engine_core_output) - detokenizer = self.request_states.get(request_id) if detokenizer is None: # Ignore output for already-aborted request. 
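The detokenizer state that `step()` consults above also carries the `stop_buffer_length` bookkeeping set up in `from_new_request`: while a request is running, the last `max(len(stop)) - 1` characters are held back so a stop string that straddles a streaming chunk is never emitted. A simplified, self-contained illustration of that rule (it ignores `include_stop_str_in_output` and the offset tracking):

    from typing import List

    def streamable_prefix(output_text: str, stops: List[str],
                          finished: bool) -> str:
        # Hold back enough characters to cover any partially generated stop
        # string; release everything once the request is finished.
        holdback = 0 if finished or not stops else max(len(s) for s in stops) - 1
        return output_text[:len(output_text) - holdback] if holdback else output_text

    assert streamable_prefix("Hello wor", ["world"], finished=False) == "Hello"
    assert streamable_prefix("Hello wor", ["world"], finished=True) == "Hello wor"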
@@ -514,11 +472,10 @@ def shutdown(self): if self.proc_handle.proc.is_alive(): kill_process_tree(self.proc_handle.proc.pid) - async def add_request_async(self, request: DetokenizerRequest): + async def add_request_async(self, request: EngineRequest): """Send new DetokenizerRequest to Detokenizer.""" - msg = (self.encoder.encode(request), ) - await self.input_socket.send_multipart(msg, copy=False) + await self.input_socket.send_pyobj(request) async def get_output_async(self) -> List[RequestOutput]: """Get RequestOutputs, RequestsToAbort from Detokenizer.""" From 6f0adfee1dbae03e2ec3e40b0ac453c854ae645b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 23:15:26 +0000 Subject: [PATCH 061/132] working again --- tests/v1/engine/test_engine_core.py | 6 +- tests/v1/engine/test_engine_core_client.py | 6 +- vllm/v1/engine/__init__.py | 4 +- vllm/v1/engine/async_llm.py | 11 ++- vllm/v1/engine/core.py | 38 ++++---- vllm/v1/engine/core_client.py | 101 +++------------------ vllm/v1/engine/detokenizer.py | 51 ++++++----- vllm/v1/engine/llm_engine.py | 4 +- vllm/v1/engine/processor.py | 2 +- vllm/v1/request.py | 4 +- 10 files changed, 81 insertions(+), 146 deletions(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index a61ec63a365b5..5c9bfa02a5b0f 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -8,7 +8,7 @@ from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform from vllm.usage.usage_lib import UsageContext -from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine import EngineRequest from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core import EngineCore @@ -22,8 +22,8 @@ PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids -def make_request() -> EngineCoreRequest: - return EngineCoreRequest( +def make_request() -> EngineRequest: + return EngineRequest( request_id=uuid.uuid4(), prompt=PROMPT, prompt_token_ids=PROMPT_TOKENS, diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 2f1cbec607a91..20db30e8b1223 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -10,7 +10,7 @@ from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform from vllm.usage.usage_lib import UsageContext -from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine import EngineRequest from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core_client import EngineCoreClient @@ -24,8 +24,8 @@ PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids -def make_request(params: SamplingParams) -> EngineCoreRequest: - return EngineCoreRequest( +def make_request(params: SamplingParams) -> EngineRequest: + return EngineRequest( request_id=str(uuid.uuid4()), prompt=PROMPT, prompt_token_ids=PROMPT_TOKENS, diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index df0001679e555..d55484ee524d8 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -66,7 +66,7 @@ class EngineCoreProfile: is_start: bool -class EngineCoreRequestType(enum.Enum): +class EngineRequestType(enum.Enum): """ Request types defined as hex byte strings, so it can be sent over sockets without separate encoding step. 
@@ -76,4 +76,4 @@ class EngineCoreRequestType(enum.Enum): PROFILE = b'\x02' -EngineCoreRequestUnion = Union[EngineRequest, EngineCoreProfile, List[str]] +EngineRequestUnion = Union[EngineRequest, EngineCoreProfile, List[str]] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 8dcf8de1a092e..6164fe1cf509a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -18,7 +18,7 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils import get_open_zmq_ipc_path -from vllm.v1.engine.core_client import AsyncMPClient +from vllm.v1.engine.core_client import MultiprocessEngineCore from vllm.v1.engine.detokenizer import DetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor @@ -64,18 +64,20 @@ def __init__( # List of cancelled request ids to be aborted. self.client_aborted_requests: List[str] = [] - # Processor (converts Inputs --> EngineCoreRequests). + # Processor (converts Inputs --> EngineRequest). self.processor = Processor(vllm_config.model_config, vllm_config.lora_config, self.tokenizer, input_registry) - # IPC path for EngineCore -> Detokenizer. + # IPC paths. engine_core_outputs_path = get_open_zmq_ipc_path() + engine_core_inputs_path = get_open_zmq_ipc_path() # Detokenizer (converts EngineCoreOutputs --> RequestOutput). self.detokenizer = DetokenizerClient( engine_core_outputs_path=engine_core_outputs_path, + engine_core_inputs_path=engine_core_inputs_path, tokenizer_name=vllm_config.model_config.tokenizer, tokenizer_mode=vllm_config.model_config.tokenizer_mode, trust_remote_code=vllm_config.model_config.trust_remote_code, @@ -83,7 +85,8 @@ def __init__( ) # EngineCore (starts the engine in background process). - self.engine_core = AsyncMPClient( + self.engine_core = MultiprocessEngineCore( + input_path=engine_core_inputs_path, output_path=engine_core_outputs_path, vllm_config=vllm_config, executor_class=executor_class, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e35e5ac464305..9e985be743e0d 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -19,7 +19,7 @@ from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineRequest, - EngineCoreRequestType, EngineCoreRequestUnion, + EngineRequestType, EngineRequestUnion, BackgroundProcHandle) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor @@ -155,7 +155,7 @@ def __init__( # and to overlap some serialization/deserialization with the # model forward pass. # Threads handle Socket <-> Queues and core_busy_loop uses Queue. 
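The comment above ("Threads handle Socket <-> Queues and core_busy_loop uses Queue") is the key decoupling in EngineCoreProc: socket I/O lives on daemon threads and the busy loop only ever touches in-process queues. A minimal sketch of the input side, assuming a PULL socket carrying pickled requests (whether to bind or connect depends on the process topology):

    import queue
    import threading

    import zmq

    def start_input_thread(ctx: zmq.Context, input_path: str) -> queue.Queue:
        input_queue: queue.Queue = queue.Queue()

        def _pull_loop() -> None:
            socket = ctx.socket(zmq.PULL)
            socket.connect(input_path)
            while True:
                # Blocking recv happens here, off the busy loop's thread.
                input_queue.put_nowait(socket.recv_pyobj())

        threading.Thread(target=_pull_loop, daemon=True).start()
        return input_queue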
- self.input_queue: queue.Queue[EngineCoreRequestUnion] = queue.Queue() + self.input_queue: queue.Queue[EngineRequestUnion] = queue.Queue() self.output_queue: queue.Queue[List[EngineCoreOutput]] = queue.Queue() threading.Thread(target=self.process_input_socket, args=(input_path, ), @@ -289,10 +289,10 @@ def _log_stats(self): self._last_logging_time = now - def _handle_client_request(self, request: EngineCoreRequestUnion) -> None: - """Handle EngineCoreRequest or EngineCoreABORT from Client.""" + def _handle_client_request(self, request: EngineRequestUnion) -> None: + """Handle EngineRequest or EngineCoreABORT from Client.""" - if isinstance(request, EngineCoreRequest): + if isinstance(request, EngineRequest): self.add_request(request) elif isinstance(request, EngineCoreProfile): self.model_executor.profile(request.is_start) @@ -311,21 +311,23 @@ def process_input_socket(self, input_path: str): with zmq_socket_ctx(input_path, zmq.PULL) as socket: while True: # (RequestType, RequestData) - type_frame, data_frame = socket.recv_multipart(copy=False) - request_type = type_frame.buffer - request_data = data_frame.buffer - - # Deserialize the request data. - if request_type == EngineCoreRequestType.ADD.value: - request = decoder_add_req.decode(request_data) - elif request_type == EngineCoreRequestType.ABORT.value: - request = decoder_abort_req.decode(request_data) - elif request_type == EngineCoreRequestType.PROFILE.value: - request = pickle.loads(request_data) - else: - raise ValueError(f"Unknown RequestType: {request_type}") + # type_frame, data_frame = socket.recv_multipart(copy=False) + # request_type = type_frame.buffer + # request_data = data_frame.buffer + + + # # Deserialize the request data. + # if request_type == EngineRequestType.ADD.value: + # request = decoder_add_req.decode(request_data) + # elif request_type == EngineRequestType.ABORT.value: + # request = decoder_abort_req.decode(request_data) + # elif request_type == EngineRequestType.PROFILE.value: + # request = pickle.loads(request_data) + # else: + # raise ValueError(f"Unknown RequestType: {request_type}") # Push to input queue for core busy loop. 
+ request = socket.recv_pyobj() self.input_queue.put_nowait(request) def process_output_socket(self, output_path: str): diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 7559ca1af2a03..d094e2de11b85 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -10,8 +10,8 @@ from vllm.utils import kill_process_tree, get_open_zmq_ipc_path from vllm.v1.engine import (BackgroundProcHandle, EngineCoreOutput, EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion) + EngineCoreProfile, EngineRequest, + EngineRequestType, EngineRequestUnion) from vllm.v1.engine.core import (EngineCore, EngineCoreProc) from vllm.v1.serial_utils import PickleEncoder from vllm.v1.utils import make_zmq_socket @@ -58,7 +58,7 @@ def shutdown(self): def get_output(self) -> List[EngineCoreOutput]: raise NotImplementedError - def add_request(self, request: EngineCoreRequest) -> None: + def add_request(self, request: EngineRequest) -> None: raise NotImplementedError def profile(self, is_start: bool = True) -> None: @@ -70,7 +70,7 @@ def abort_requests(self, request_ids: List[str]) -> None: async def get_output_async(self) -> List[EngineCoreOutput]: raise NotImplementedError - async def add_request_async(self, request: EngineCoreRequest) -> None: + async def add_request_async(self, request: EngineRequest) -> None: raise NotImplementedError async def profile_async(self, is_start: bool = True) -> None: @@ -86,7 +86,7 @@ class InprocClient(EngineCoreClient): for use in LLMEngine for V0-style add_request() and step() EngineCore setup in this process (no busy loop). - * pushes EngineCoreRequest directly into the EngineCore + * pushes EngineRequest directly into the EngineCore * pulls EngineCoreOutputs by stepping the EngineCore TODO: support asyncio-mode for debugging. @@ -98,7 +98,7 @@ def __init__(self, *args, **kwargs): def get_output(self) -> List[EngineCoreOutput]: return self.engine_core.step() - def add_request(self, request: EngineCoreRequest) -> None: + def add_request(self, request: EngineRequest) -> None: self.engine_core.add_request(request) def abort_requests(self, request_ids: List[str]) -> None: @@ -114,53 +114,29 @@ def profile(self, is_start: bool = True) -> None: self.engine_core.profile(is_start) -class MPClient(EngineCoreClient): +class MultiprocessEngineCore: """ - MPClient: base client for multi-proc EngineCore. + MultiprocessEngineCore: base client for multi-proc EngineCore. EngineCore runs in a background process busy loop, getting - new EngineCoreRequests and returning EngineCoreOutputs + new EngineRequests and returning EngineCoreOutputs - * pushes EngineCoreRequests via input_socket + * pushes EngineRequests via input_socket * pulls EngineCoreOutputs via output_socket - - * AsyncMPClient subclass for AsyncLLM usage - * SyncMPClient subclass for LLM usage """ def __init__( self, *args, - asyncio_mode: bool, + input_path: Optional[str] = None, output_path: Optional[str] = None, **kwargs, ): - # Serialization setup. - self.encoder = PickleEncoder() - self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) - - # ZMQ setup. 
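The `recv_pyobj()` call above is pyzmq's pickle convenience layer, replacing the explicit type-byte + msgpack framing in the commented-out block. A tiny self-contained round trip showing what those helpers do (inproc transport and payload are illustrative); pickle is convenient between trusted local processes, though slower and less strict than the msgspec path used elsewhere in this series:

    import zmq

    ctx = zmq.Context()
    pull = ctx.socket(zmq.PULL)
    push = ctx.socket(zmq.PUSH)
    pull.bind("inproc://demo")
    push.connect("inproc://demo")

    push.send_pyobj({"request_id": "req-0", "token_ids": [1, 2, 3]})
    print(pull.recv_pyobj())  # {'request_id': 'req-0', 'token_ids': [1, 2, 3]}

    push.close()
    pull.close()
    ctx.term()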
- if asyncio_mode: - print("HERE HERE HERE") - self.ctx = zmq.asyncio.Context(io_threads=2) - else: - self.ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] - - input_path = get_open_zmq_ipc_path() - self.input_socket = make_zmq_socket( - self.ctx, - input_path, - zmq.PUSH, - ) - - if output_path is None: - output_path = get_open_zmq_ipc_path() - # Start EngineCore in background process. self.proc_handle: Optional[BackgroundProcHandle] self.proc_handle = EngineCoreProc.make_engine_core_process( *args, - input_path=input_path, - output_path=output_path, + input_path=(input_path or get_open_zmq_ipc_path()), + output_path=(output_path or get_open_zmq_ipc_path()), **kwargs, ) atexit.register(self.shutdown) @@ -172,9 +148,6 @@ def shutdown(self): # in case shutdown gets called via __del__ first atexit.unregister(self.shutdown) - # Shut down the zmq context. - self.ctx.destroy(linger=0) - if hasattr(self, "proc_handle") and self.proc_handle: # Shutdown the process if needed. if self.proc_handle.proc.is_alive(): @@ -197,51 +170,3 @@ def shutdown(self): def __del__(self): self.shutdown() - - -class SyncMPClient(MPClient): - """Synchronous client for multi-proc EngineCore.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, asyncio_mode=False, **kwargs) - - def _send_input(self, request_type: EngineCoreRequestType, - request: EngineCoreRequestUnion) -> None: - - # (RequestType, SerializedRequest) - msg = (request_type.value, self.encoder.encode(request)) - self.input_socket.send_multipart(msg, copy=False) - - def add_request(self, request: EngineCoreRequest) -> None: - self._send_input(EngineCoreRequestType.ADD, request) - - def abort_requests(self, request_ids: List[str]) -> None: - self._send_input(EngineCoreRequestType.ABORT, request_ids) - - def profile(self, is_start: bool = True) -> None: - self._send_input(EngineCoreRequestType.PROFILE, - EngineCoreProfile(is_start)) - - -class AsyncMPClient(MPClient): - """Asyncio-compatible client for multi-proc EngineCore.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, asyncio_mode=True, **kwargs) - - async def _send_input(self, request_type: EngineCoreRequestType, - request: EngineCoreRequestUnion) -> None: - - msg = (request_type.value, self.encoder.encode(request)) - await self.input_socket.send_multipart(msg, copy=False) - - async def add_request_async(self, request: EngineCoreRequest) -> None: - await self._send_input(EngineCoreRequestType.ADD, request) - - async def abort_requests_async(self, request_ids: List[str]) -> None: - if len(request_ids) > 0: - await self._send_input(EngineCoreRequestType.ABORT, request_ids) - - async def profile_async(self, is_start: bool = True) -> None: - await self._send_input(EngineCoreRequestType.PROFILE, - EngineCoreProfile(is_start)) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 91b5f7a9c1800..7c5ce6fbbfed9 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -3,7 +3,7 @@ import msgspec import signal from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import Dict, Iterable, List, Optional, Union from vllm.engine.output_processor.stop_checker import StopChecker from vllm.executor.multiproc_worker_utils import get_mp_context @@ -75,7 +75,7 @@ def from_new_request( skip_special_tokens=sampling_params.skip_special_tokens, ) - stops = request.sampling_params + stops = request.sampling_params.stop # Number of chars to hold back when stop strings are 
to be excluded # from streamed output. if stops and not sampling_params.include_stop_str_in_output: @@ -231,20 +231,7 @@ def abort_requests( """Remove the request_ids from the Detokenizer.""" for request_id in request_ids: - self.request_states.pop(request_id, None) - - def add_request( - self, - request: EngineRequest, - ): - """Add new request to the Detokenizer.""" - - assert (request.request_id not in self.request_states) - - request_state = IncrementalDetokenizer.from_new_request( - self.tokenizer, request) - self.request_states[request.request_id] = request_state - + self.request_states.pop(request_id, None) def step( self, encore_core_outputs: List[EngineCoreOutput] @@ -295,6 +282,7 @@ def __init__( self, *args, engine_core_outputs_path: str, + engine_core_inputs_path: str, input_path: str, output_path: str, ready_path: str, @@ -303,6 +291,7 @@ def __init__( super().__init__(*args, **kwargs) self.engine_core_outputs_path = engine_core_outputs_path + self.engine_core_inputs_path = engine_core_inputs_path self.input_path = input_path self.output_path = output_path @@ -314,6 +303,7 @@ def __init__( @staticmethod def make_detokenizer_process( engine_core_outputs_path: str, + engine_core_inputs_path: str, input_path: str, output_path: str, tokenizer_name: str, @@ -326,6 +316,7 @@ def make_detokenizer_process( process_kwargs = { "engine_core_outputs_path": engine_core_outputs_path, + "engine_core_inputs_path": engine_core_inputs_path, "input_path": input_path, "output_path": output_path, "ready_path": ready_path, @@ -391,12 +382,12 @@ def run_busy_loop(self): # TODO: pickle -> msgpack # TODO: send stop string aborts back to EngineCore directly - decoder_new = msgspec.msgpack.Decoder(EngineRequest) decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as from_engine_core, zmq_socket_ctx(self.input_path, zmq.PULL) as from_llm_engine, - zmq_socket_ctx(self.output_path, zmq.PUSH) as output_socket): + zmq_socket_ctx(self.engine_core_inputs_path, zmq.PUSH) as to_engine_core, + zmq_socket_ctx(self.output_path, zmq.PUSH) as to_llm_engine): # TODO: avoid poll by having both EngineCore # and AsyncLLM send to the same socket (unclear why this @@ -414,16 +405,25 @@ def run_busy_loop(self): # Handle NewRequest. if from_llm_engine in socks: - (frame, ) = from_llm_engine.recv_multipart(copy=False) - engine_request = decoder_new.decode(frame.buffer) - self.add_request(engine_request) + pickled_request = from_llm_engine.recv() + request = pickle.loads(pickled_request) + + assert (request.request_id not in self.request_states) + + # Add to Detokenizer. + request_state = IncrementalDetokenizer.from_new_request( + self.tokenizer, request) + self.request_states[request.request_id] = request_state + + # Forward to EngineCore. + to_engine_core.send(pickled_request) # Handle EngineCoreOutput. if from_engine_core in socks: (frame, ) = from_engine_core.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs request_outputs, _ = self.step(engine_core_outputs) - output_socket.send_pyobj(request_outputs) + to_llm_engine.send_pyobj(request_outputs) except Exception as e: logger.error(e) @@ -431,7 +431,11 @@ def run_busy_loop(self): class DetokenizerClient: - def __init__(self, *args, engine_core_outputs_path: str, **kwargs): + def __init__(self, + *args, + engine_core_outputs_path: str, + engine_core_inputs_path: str, + **kwargs): # Serialization setup. 
self.encoder = msgspec.msgpack.Encoder() @@ -460,6 +464,7 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): self.proc_handle = DetokenizerProc.make_detokenizer_process( *args, engine_core_outputs_path=engine_core_outputs_path, + engine_core_inputs_path=engine_core_inputs_path, input_path=input_path, output_path=output_path, **kwargs, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 15dedbd0f9529..1e508c5d240e1 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -54,7 +54,7 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() - # Processor (convert Inputs --> EngineCoreRequests) + # Processor (convert Inputs --> EngineRequests) self.processor = Processor(vllm_config.model_config, vllm_config.lora_config, self.tokenizer, input_registry, mm_registry) @@ -67,7 +67,7 @@ def __init__( revision=vllm_config.model_config.tokenizer_revision, ) - # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) + # EngineCore (gets EngineRequests and gives EngineCoreOutputs) self.engine_core = EngineCoreClient.make_client( vllm_config, executor_class, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index d37eab3418c3e..60beecb24a61c 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -14,7 +14,7 @@ from vllm.sampling_params import SamplingParams from vllm.transformers_utils.config import try_get_generation_config from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup -from vllm.v1.engine import DetokenizerData, EngineRequest +from vllm.v1.engine import EngineRequest from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 1737d096e811d..e6de57dab7672 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -6,7 +6,7 @@ from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams from vllm.sequence import RequestMetrics -from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine import EngineRequest from vllm.v1.utils import ConstantList @@ -57,7 +57,7 @@ def __init__( self.mm_inputs = self.inputs.multi_modal_inputs @classmethod - def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": + def from_engine_core_request(cls, request: EngineRequest) -> "Request": return cls( request_id=request.request_id, inputs=token_inputs( From 5d2c9ae87e5d3055ce105398a6f8a01f73f85fc3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 23:34:19 +0000 Subject: [PATCH 062/132] design without incremental streaming seems okay --- vllm/v1/engine/async_llm.py | 7 ++----- vllm/v1/engine/detokenizer.py | 11 ++++------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6164fe1cf509a..d6e2bfe4fc04b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -240,8 +240,8 @@ def signal_handler(self, signum=None, frame=None): out = await asyncio.wait_for(queue.get(), timeout=4) q_size = queue.qsize() - if q_size > 0: - logger.info(f"{q_size}") + # if q_size > 0: + # logger.info(f"{q_size=}") if out.finished: del self.rid_to_queue[request_id] yield out @@ -253,9 +253,6 @@ def signal_handler(self, signum=None, frame=None): # TODO(rob): do request cancellation checking here. 
# logger.debug("Timeout waiting for %s", request_id) continue - - - # async def _process_cancellations(self) -> None: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 7c5ce6fbbfed9..6f03ec57e1105 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -399,20 +399,18 @@ def run_busy_loop(self): epoch = 0 while True: logger.info(f"EPOCH: {epoch}") - epoch += 1 socks = dict(poller.poll()) # Handle NewRequest. if from_llm_engine in socks: pickled_request = from_llm_engine.recv() - request = pickle.loads(pickled_request) + request: EngineRequest = pickle.loads(pickled_request) assert (request.request_id not in self.request_states) # Add to Detokenizer. - request_state = IncrementalDetokenizer.from_new_request( - self.tokenizer, request) + request_state = IncrementalDetokenizer.from_new_request(self.tokenizer, request) self.request_states[request.request_id] = request_state # Forward to EngineCore. @@ -420,6 +418,8 @@ def run_busy_loop(self): # Handle EngineCoreOutput. if from_engine_core in socks: + epoch += 1 + (frame, ) = from_engine_core.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs request_outputs, _ = self.step(engine_core_outputs) @@ -436,9 +436,6 @@ def __init__(self, engine_core_outputs_path: str, engine_core_inputs_path: str, **kwargs): - - # Serialization setup. - self.encoder = msgspec.msgpack.Encoder() # ZMQ setup. self.ctx = zmq.asyncio.Context(2) From 0574b89b8051f142f7c40089b201f71624fef183 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 23:35:58 +0000 Subject: [PATCH 063/132] updated --- vllm/v1/core/scheduler.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index c83c931f75fea..f76364f64033d 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -140,7 +140,6 @@ def schedule(self) -> "SchedulerOutput": preempted_req.status = RequestStatus.PREEMPTED preempted_req.num_computed_tokens = 0 - logger.info(f"Preempted: {preempted_req.request_id}") self.waiting.appendleft(preempted_req) preempted_reqs.append(preempted_req) if preempted_req == request: @@ -425,12 +424,10 @@ def update_from_output( # Check for stop and update request state. # This must be called before me make the EngineCoreOutput. stopped = self._check_stop(request) - + # Add EngineCoreOutput for this Request. output = EngineCoreOutput( request_id=req_id, - prompt=request.prompt, - prompt_token_ids=request.prompt_token_ids, new_token_ids=request.output_token_ids[-num_new_tokens:], finished=request.is_finished(), finish_reason=request.get_finished_reason(), From 19a7cd011ea04936a2146dc847241dc987c9f90c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 13:50:04 +0000 Subject: [PATCH 064/132] updated --- benchmarks/backend_request_func.py | 4 ++++ vllm/v1/engine/__init__.py | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index b67849038cf0d..fb3e7c994d4d6 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -295,6 +295,7 @@ async def async_request_openai_completions( if first_chunk_received: output.success = True else: + print("error 0!") output.success = False output.error = ( "Never received a valid chunk to calculate TTFT." 
@@ -302,12 +303,15 @@ async def async_request_openai_completions( output.generated_text = generated_text output.latency = latency else: + print("error 1!") output.error = response.reason or "" output.success = False except Exception: + print("error 2!") output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + print(f"{output.error=}") if pbar: pbar.update(1) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index d55484ee524d8..9e4f8a9d6e29a 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -40,8 +40,6 @@ class EngineCoreOutput( gc=False): # type: ignore[call-arg] request_id: str - prompt_token_ids: List[int] - prompt: Optional[str] new_token_ids: List[int] finished: bool finish_reason: Optional[str] = None From 067d487b51556f84d3555eff24b4fd1a354b038a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 13:52:56 +0000 Subject: [PATCH 065/132] updated --- vllm/v1/engine/async_llm.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d6e2bfe4fc04b..a300af75c2cf8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -25,9 +25,6 @@ logger = init_logger(__name__) -import uvloop -asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) - class AsyncLLM(EngineClient): From c1c8749fd16cd416216d95dbfe2eafc8811df32e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 13:54:56 +0000 Subject: [PATCH 066/132] more cleanup --- vllm/entrypoints/openai/serving_completion.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index d87c410c0124c..bd39a4c42e938 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -159,10 +159,8 @@ async def create_completion( # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - # result_generator = merge_async_iterators( - # *generators, is_cancelled=raw_request.is_disconnected) - assert len(generators) == 1 - result_generator = generators[0] + result_generator = merge_async_iterators( + *generators, is_cancelled=raw_request.is_disconnected) model_name = self._get_model_name(lora_request) num_prompts = len(engine_prompts) @@ -258,9 +256,7 @@ async def completion_stream_generator( include_usage, include_continuous_usage = False, False try: - # async for prompt_idx, res in result_generator: - async for res in result_generator: - prompt_idx = 0 + async for prompt_idx, res in result_generator: prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt From ceacaddce4f203de19cbac7f1690ab014add2dee Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 14:08:37 +0000 Subject: [PATCH 067/132] working e2e with the fds --- vllm/v1/engine/async_llm.py | 82 +++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a300af75c2cf8..9c6206ddb5c31 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -206,50 +206,54 @@ async def generate( to the caller which iterates the AsyncGenerator. 
""" - # We start the output_handler on the first call to generate() so that - # we can call __init__ before the event loop starts, which enables us - # to handle startup failure gracefully in the OpenAI server. - # if self.output_handler is None: - if self.to_create_loop: - import signal - def signal_handler(self, signum=None, frame=None): - logger.warning( - f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." + try: + # We start the output_handler on the first call to generate() so that + # we can call __init__ before the event loop starts, which enables us + # to handle startup failure gracefully in the OpenAI server. + # if self.output_handler is None: + if self.to_create_loop: + import signal + def signal_handler(self, signum=None, frame=None): + logger.warning( + f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." + ) + + self.to_create_loop = False + loop = asyncio.get_event_loop() + loop.create_task(self._run_output_handler()) + loop.add_signal_handler(signal.SIGTERM, signal_handler) + + queue = await self.add_request( + request_id, + prompt, + sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=priority, ) - self.to_create_loop = False - loop = asyncio.get_event_loop() - loop.create_task(self._run_output_handler()) - loop.add_signal_handler(signal.SIGTERM, signal_handler) - - queue = await self.add_request( - request_id, - prompt, - sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, - priority=priority, - ) + while True: + try: + out = await asyncio.wait_for(queue.get(), timeout=4) - while True: - try: - out = await asyncio.wait_for(queue.get(), timeout=4) - - q_size = queue.qsize() - # if q_size > 0: - # logger.info(f"{q_size=}") - if out.finished: - del self.rid_to_queue[request_id] - yield out - break + q_size = queue.qsize() + # if q_size > 0: + # logger.info(f"{q_size=}") + if out.finished: + del self.rid_to_queue[request_id] + yield out + break - yield out + yield out - except asyncio.TimeoutError: - # TODO(rob): do request cancellation checking here. - # logger.debug("Timeout waiting for %s", request_id) - continue + except asyncio.TimeoutError: + # TODO(rob): do request cancellation checking here. + # logger.debug("Timeout waiting for %s", request_id) + continue + except BaseException as e: + logger.error(repr(e)) + raise e # async def _process_cancellations(self) -> None: From 630c72feea07861f10872d7c1b237ef525be4430 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 14:12:21 +0000 Subject: [PATCH 068/132] fix --- benchmarks/backend_request_func.py | 4 ---- vllm/entrypoints/openai/serving_completion.py | 9 ++++++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index fb3e7c994d4d6..b67849038cf0d 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -295,7 +295,6 @@ async def async_request_openai_completions( if first_chunk_received: output.success = True else: - print("error 0!") output.success = False output.error = ( "Never received a valid chunk to calculate TTFT." 
@@ -303,15 +302,12 @@ async def async_request_openai_completions( output.generated_text = generated_text output.latency = latency else: - print("error 1!") output.error = response.reason or "" output.success = False except Exception: - print("error 2!") output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) - print(f"{output.error=}") if pbar: pbar.update(1) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index bd39a4c42e938..019a16ed654e5 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -159,8 +159,9 @@ async def create_completion( # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - result_generator = merge_async_iterators( - *generators, is_cancelled=raw_request.is_disconnected) + # result_generator = merge_async_iterators( + # *generators, is_cancelled=raw_request.is_disconnected) + result_generator = generator model_name = self._get_model_name(lora_request) num_prompts = len(engine_prompts) @@ -256,7 +257,9 @@ async def completion_stream_generator( include_usage, include_continuous_usage = False, False try: - async for prompt_idx, res in result_generator: + # async for prompt_idx, res in result_generator: + async for res in result_generator: + prompt_idx = 0 prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt From e02101233b93d3c98743456bd8e6a2ac3b8dab67 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 14:20:52 +0000 Subject: [PATCH 069/132] updated --- vllm/v1/engine/async_llm.py | 83 +++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 44 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 9c6206ddb5c31..d16a33bbbbad3 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -206,55 +206,50 @@ async def generate( to the caller which iterates the AsyncGenerator. """ - try: - # We start the output_handler on the first call to generate() so that - # we can call __init__ before the event loop starts, which enables us - # to handle startup failure gracefully in the OpenAI server. - # if self.output_handler is None: - if self.to_create_loop: - import signal - def signal_handler(self, signum=None, frame=None): - logger.warning( - f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." - ) - - self.to_create_loop = False - loop = asyncio.get_event_loop() - loop.create_task(self._run_output_handler()) - loop.add_signal_handler(signal.SIGTERM, signal_handler) - - queue = await self.add_request( - request_id, - prompt, - sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, - priority=priority, + # We start the output_handler on the first call to generate() so that + # we can call __init__ before the event loop starts, which enables us + # to handle startup failure gracefully in the OpenAI server. + # if self.output_handler is None: + if self.to_create_loop: + import signal + def signal_handler(self, signum=None, frame=None): + logger.warning( + f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." 
) - while True: - try: - out = await asyncio.wait_for(queue.get(), timeout=4) - - q_size = queue.qsize() - # if q_size > 0: - # logger.info(f"{q_size=}") - if out.finished: - del self.rid_to_queue[request_id] - yield out - break + self.to_create_loop = False + loop = asyncio.get_event_loop() + loop.create_task(self._run_output_handler()) + loop.add_signal_handler(signal.SIGTERM, signal_handler) + + queue = await self.add_request( + request_id, + prompt, + sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=priority, + ) + while True: + try: + out = await asyncio.wait_for(queue.get(), timeout=4) + + q_size = queue.qsize() + # if q_size > 0: + # logger.info(f"{q_size=}") + if out.finished: + del self.rid_to_queue[request_id] yield out + break + + yield out - except asyncio.TimeoutError: - # TODO(rob): do request cancellation checking here. - # logger.debug("Timeout waiting for %s", request_id) - continue - except BaseException as e: - logger.error(repr(e)) - raise e - + except asyncio.TimeoutError: + # TODO(rob): do request cancellation checking here. + # logger.debug("Timeout waiting for %s", request_id) + continue # async def _process_cancellations(self) -> None: # """ From c58d0ff348477b680e8dd6e10787af1f89f64c95 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 14:45:12 +0000 Subject: [PATCH 070/132] performance is now good --- vllm/v1/engine/async_llm.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d16a33bbbbad3..b078b05369e6d 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -234,16 +234,16 @@ def signal_handler(self, signum=None, frame=None): while True: try: - out = await asyncio.wait_for(queue.get(), timeout=4) + if queue.qsize() > 0: + out = queue.get_nowait() + else: + out = await asyncio.wait_for(queue.get(), timeout=4) - q_size = queue.qsize() - # if q_size > 0: - # logger.info(f"{q_size=}") if out.finished: del self.rid_to_queue[request_id] yield out break - + yield out except asyncio.TimeoutError: From d7af4bcc0af97bc870ee729fc4dfebb0b9cd985b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 14:59:54 +0000 Subject: [PATCH 071/132] updated --- vllm/v1/engine/async_llm.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b078b05369e6d..7b87b7ce2c819 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,5 +1,5 @@ import asyncio -from dataclasses import dataclass +import fastapi from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -234,6 +234,8 @@ def signal_handler(self, signum=None, frame=None): while True: try: + # Note: drain queue without awaiting if possible (this helps + # to avoid task switching under load + helps performance) if queue.qsize() > 0: out = queue.get_nowait() else: @@ -247,8 +249,7 @@ def signal_handler(self, signum=None, frame=None): yield out except asyncio.TimeoutError: - # TODO(rob): do request cancellation checking here. 
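The qsize()/get_nowait() branch introduced by patch 070 above is the actual performance fix: awaiting queue.get() suspends the coroutine and forces a task switch even when an output is already buffered, so the generator drains synchronously whenever it can. A minimal, self-contained sketch of that consumer pattern (the None end-of-stream sentinel is an assumption for illustration, not part of the patch):

    import asyncio

    async def consume(q: asyncio.Queue) -> None:
        while True:
            # Take an item without awaiting whenever one is already buffered;
            # only suspend when the queue is empty.
            out = q.get_nowait() if q.qsize() > 0 else await q.get()
            if out is None:  # assumed end-of-stream sentinel
                break
            print(out)

    async def main() -> None:
        q: asyncio.Queue = asyncio.Queue()
        for item in ("tok-1", "tok-2", None):
            q.put_nowait(item)
        await consume(q)

    asyncio.run(main())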
- # logger.debug("Timeout waiting for %s", request_id) + logger.debug("%s request timed out waiting", request_id) continue # async def _process_cancellations(self) -> None: @@ -308,7 +309,7 @@ async def _run_output_handler(self): async def abort(self, request_id: str) -> None: # Note: this is not used outside of testing. - raise ValueError("Not Supported on V1 yet.") + pass def encode( self, From 091435158fbb068662509bb455bfee48fe6d8b8f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 16:06:45 +0000 Subject: [PATCH 072/132] updated --- vllm/v1/engine/__init__.py | 26 +++--- vllm/v1/engine/async_llm.py | 137 ++++++++++++---------------- vllm/v1/engine/core.py | 41 +++------ vllm/v1/engine/core_client.py | 3 - vllm/v1/engine/detokenizer.py | 164 +++++++++++++++++++--------------- 5 files changed, 171 insertions(+), 200 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 9e4f8a9d6e29a..4e84d763ae9be 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -32,6 +32,17 @@ class EngineRequest: arrival_time: float lora_request: Optional[LoRARequest] +@dataclass +class EngineAbortRequest: + + request_ids: List[str] + +@dataclass +class EngineProfileRequest: + + is_start: bool + +EngineRequestUnion = Union[EngineRequest, EngineAbortRequest, EngineProfileRequest] class EngineCoreOutput( msgspec.Struct, @@ -59,19 +70,4 @@ class EngineCoreOutputs( outputs: List[EngineCoreOutput] -@dataclass -class EngineCoreProfile: - is_start: bool - - -class EngineRequestType(enum.Enum): - """ - Request types defined as hex byte strings, so it can be sent over sockets - without separate encoding step. - """ - ADD = b'\x00' - ABORT = b'\x01' - PROFILE = b'\x02' - -EngineRequestUnion = Union[EngineRequest, EngineCoreProfile, List[str]] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index f8b02ba7fa936..6e15665df61de 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -18,6 +18,7 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils import get_open_zmq_ipc_path +from vllm.v1.engine import EngineAbortRequest from vllm.v1.engine.core_client import MultiprocessEngineCore from vllm.v1.engine.detokenizer import DetokenizerClient from vllm.v1.engine.processor import Processor @@ -92,7 +93,6 @@ def __init__( usage_context=usage_context, ) - # self.output_handler: Optional[asyncio.Task] = None self.to_create_loop = True def __del__(self): @@ -172,11 +172,11 @@ async def add_request( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 2) Create Queue (output_handler pushes, generate pulls) + # 2) Create Queue (output_handler() pushes, generate() pulls) self.rid_to_queue[request_id] = asyncio.Queue() - # 3) Send to Detokenizer. - await self.detokenizer.add_request_async(engine_request) + # 3) Send to Detokenizer (which forwards to EngineCore). + await self.detokenizer.input_socket.send_pyobj(engine_request) return self.rid_to_queue[request_id] @@ -197,93 +197,68 @@ async def generate( ) -> AsyncGenerator[RequestOutput, None]: """ Main function called by the API server to kick off a request - * 1) Make a queue corresponding to the Request. - # 2) Processing the Input. - * 3) Adding the Request to the Detokenize + EngineCore. + * 1) Make an output queue for the Request. + # 2) Processing the Input (e.g. Tokenizer). 
+ * 3) Adding the Request to Detokenizer + EngineCore. The output_handler() loop runs in a background task, pulling from Detokenizer and pushing to the per request queue. - The generate() pulls from the per requests queue and yeilds + The generate() pulls from the per request queue and yeilds to the caller which iterates the AsyncGenerator. """ - # We start the output_handler on the first call to generate() so that - # we can call __init__ before the event loop starts, which enables us - # to handle startup failure gracefully in the OpenAI server. - if self.to_create_loop: - import signal - def signal_handler(self, signum=None, frame=None): - logger.warning( - f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." + try: + # We start the output_handler on the first call to generate() so that + # we can call __init__ before the event loop starts, which enables us + # to handle startup failure gracefully in the OpenAI server. + if self.to_create_loop: + import signal + def signal_handler(self, signum=None, frame=None): + logger.warning( + f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." + ) + + self.to_create_loop = False + loop = asyncio.get_event_loop() + loop.create_task(self.output_handler()) + loop.add_signal_handler(signal.SIGTERM, signal_handler) + + q = await self.add_request( + request_id, + prompt, + sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=priority, ) - self.to_create_loop = False - loop = asyncio.get_event_loop() - loop.create_task(self._run_output_handler()) - loop.add_signal_handler(signal.SIGTERM, signal_handler) - - queue = await self.add_request( - request_id, - prompt, - sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, - priority=priority, - ) - - while True: - try: - # Note: drain queue without awaiting if possible (this helps - # to avoid task switching under load + helps performance). - if queue.qsize() > 0: - out = queue.get_nowait() - else: - out = await asyncio.wait_for(queue.get(), timeout=4) + # The output_handler task pushes items into the queue. + # This task pulls from the queue and yields them. + while True: + # Note: drain queue without await if possible (avoids + # task switching under load --> helps performance). + out = q.get_nowait() if q.qsize() > 0 else await q.get() + # Note: both Detokenizer and EngineCore handle their + # own cleanup based on finished. if out.finished: del self.rid_to_queue[request_id] yield out break - - yield out - - except asyncio.TimeoutError: - logger.debug("%s request timed out waiting", request_id) - continue - - # async def _process_cancellations(self) -> None: - # """ - # Process requests cancelled from user disconnecting. - - # When a client disconnects, AsyncStream._cancel() is called. - # We passed a callback to AsyncStream(), which appends to - # self.client_aborted_requests. - # As a result, if any requests are canceled from the user side - # the request_id will show up in self.client_aborted_requests. - # """ - - # # Avoid streams having circular ref to parent AsyncLLM object. - # if not self.client_aborted_requests: - # return - # reqs_to_abort = self.client_aborted_requests.copy() - # self.client_aborted_requests.clear() - - # # Remove from Detokenizer. - # self.detokenizer.abort_requests(reqs_to_abort) - - # # Remove from RequestStreams. 
- # for request_id in reqs_to_abort: - # if self.log_requests: - # logger.info("User-cancelled request %s.", request_id) - # self._finish_stream(request_id) + yield out + + # Client request cancellation is handled through calling + # task.cancel() on generate. So we abort to alert the Detokenizer + # and the EngineCore. + except asyncio.CancelledError: + await self.abort(request_id) + raise - # # Remove from EngineCore. - # await self.engine_core.abort_requests_async(reqs_to_abort) - async def _run_output_handler(self): + async def output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" epoch = 0 @@ -301,16 +276,14 @@ async def _run_output_handler(self): self.rid_to_queue[out.request_id].put_nowait(out) - # 3) Abort any requests that finished due to stop strings. - # await self.engine_core.abort_requests_async(reqs_to_abort) - # 4) Abort any requests due to client cancellations. - # TODO: send back to detokenizer if this fails. - # await self._process_cancellations() + async def abort(self, request_id: str): - async def abort(self, request_id: str) -> None: - # Note: this is not used outside of testing. - pass + await self.detokenizer.input_socket.send_pyobj( + EngineAbortRequest([request_id])) + + if self.log_requests: + logger.info("Aborted %s.", request_id) def encode( self, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5f9674f2569c3..67920688f0031 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -18,8 +18,8 @@ from vllm.utils import get_open_zmq_ipc_path from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineCoreProfile, EngineRequest, - EngineRequestType, EngineRequestUnion, + EngineAbortRequest, EngineRequest, + EngineProfileRequest, EngineRequestUnion, BackgroundProcHandle) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor @@ -92,6 +92,8 @@ def _initialize_kv_caches(self, def add_request(self, request: EngineRequest): """Add request to the scheduler.""" + logger.debug("Adding request: %s", request.request_id) + if request.mm_hashes is not None: # Here, if hash exists for an image, then it will be fetched # from the cache, else it will be added to the cache. @@ -102,16 +104,15 @@ def add_request(self, request: EngineRequest): request.mm_inputs = self.mm_input_mapper_server.process_inputs( request.mm_inputs, request.mm_hashes) + # TODO: instead of sending EngineRequest, should we just send + # around Request? req = Request.from_engine_core_request(request) - self.scheduler.add_request(req) def abort_requests(self, request_ids: List[str]): """Abort requests from the scheduler.""" - # TODO: The scheduler doesn't really need to know the - # specific finish reason, TBD whether we propagate that - # (i.e. client-aborted vs stop criteria met). 
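output_handler() above is now the only consumer of the Detokenizer's output socket: it receives RequestOutputs in batches and routes each one onto the asyncio.Queue owned by its request id, so generate() only ever reads its own queue. A small standalone sketch of that routing step, with plain dicts standing in for RequestOutput and rid_to_queue:

    import asyncio
    from typing import Dict, List

    def route(outputs: List[dict],
              rid_to_queue: Dict[str, asyncio.Queue]) -> None:
        # Push each output onto the queue owned by its request id.
        for out in outputs:
            rid = out["request_id"]
            if rid not in rid_to_queue:
                raise RuntimeError(f"{rid} not in rid_to_queue")
            rid_to_queue[rid].put_nowait(out)

    queues = {"req-0": asyncio.Queue(), "req-1": asyncio.Queue()}
    route([{"request_id": "req-0", "text": "hi"}], queues)
    assert queues["req-0"].qsize() == 1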
+ logger.debug("Aborting requests: %s", request_ids) self.scheduler.finish_requests(request_ids, RequestStatus.FINISHED_ABORTED) @@ -294,38 +295,18 @@ def _handle_client_request(self, request: EngineRequestUnion) -> None: if isinstance(request, EngineRequest): self.add_request(request) - elif isinstance(request, EngineCoreProfile): + elif isinstance(request, EngineProfileRequest): self.model_executor.profile(request.is_start) + elif isinstance(request, EngineAbortRequest): + self.abort_requests(request.request_ids) else: - # TODO: make an EngineCoreAbort wrapper - assert isinstance(request, list) - self.abort_requests(request) + raise ValueError("Unknown request type: {request}") def process_input_socket(self, input_path: str): """Input socket IO thread.""" - # Msgpack serialization decoding. - decoder_add_req = PickleEncoder() - decoder_abort_req = PickleEncoder() - with zmq_socket_ctx(input_path, zmq.PULL) as socket: while True: - # (RequestType, RequestData) - # type_frame, data_frame = socket.recv_multipart(copy=False) - # request_type = type_frame.buffer - # request_data = data_frame.buffer - - - # # Deserialize the request data. - # if request_type == EngineRequestType.ADD.value: - # request = decoder_add_req.decode(request_data) - # elif request_type == EngineRequestType.ABORT.value: - # request = decoder_abort_req.decode(request_data) - # elif request_type == EngineRequestType.PROFILE.value: - # request = pickle.loads(request_data) - # else: - # raise ValueError(f"Unknown RequestType: {request_type}") - # Push to input queue for core busy loop. request = socket.recv_pyobj() self.input_queue.put_nowait(request) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index b328d7337b18a..07f0ae2c9059f 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -142,9 +142,6 @@ def __init__( self._finalizer = weakref.finalize(self, self.shutdown) def shutdown(self): - # Shut down the zmq context. - self.ctx.destroy(linger=0) - if hasattr(self, "proc_handle") and self.proc_handle: # Shutdown the process if needed. 
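With the EngineRequestType byte-prefix protocol gone, the EngineCore input socket carries plain pickled objects and _handle_client_request() above dispatches purely on type. A minimal model of that dispatch, using stand-in dataclasses rather than the real EngineRequest / EngineAbortRequest / EngineProfileRequest:

    from dataclasses import dataclass
    from typing import List, Union

    @dataclass
    class AddRequest:        # stand-in for EngineRequest
        request_id: str

    @dataclass
    class AbortRequest:      # stand-in for EngineAbortRequest
        request_ids: List[str]

    @dataclass
    class ProfileRequest:    # stand-in for EngineProfileRequest
        is_start: bool

    RequestUnion = Union[AddRequest, AbortRequest, ProfileRequest]

    def handle(req: RequestUnion) -> str:
        if isinstance(req, AddRequest):
            return f"add {req.request_id}"
        if isinstance(req, AbortRequest):
            return f"abort {req.request_ids}"
        if isinstance(req, ProfileRequest):
            return "start profile" if req.is_start else "stop profile"
        raise ValueError(f"Unknown request type: {req}")

    assert handle(AbortRequest(["req-0"])) == "abort ['req-0']"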
if self.proc_handle.proc.is_alive(): diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 6f03ec57e1105..ab06d811b1c70 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -3,7 +3,7 @@ import msgspec import signal from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Union +from typing import Dict, Iterable, List, Optional, Tuple,Union from vllm.engine.output_processor.stop_checker import StopChecker from vllm.executor.multiproc_worker_utils import get_mp_context @@ -15,7 +15,8 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import get_open_zmq_ipc_path, kill_process_tree from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - BackgroundProcHandle, EngineRequest) + BackgroundProcHandle, + EngineRequest, EngineAbortRequest) from vllm.v1.utils import (make_zmq_socket, zmq_socket_ctx, wait_for_startup) @@ -234,14 +235,14 @@ def abort_requests( self.request_states.pop(request_id, None) def step( - self, encore_core_outputs: List[EngineCoreOutput] - ) -> List[RequestOutput]: - """Update state and request the RequestOutputs to the LLMEngine.""" + self, encore_core_outputs: EngineCoreOutputs, + ) -> Tuple[List[RequestOutput], List[str]]: + """Update state and make RequestOutputs for the LLMEngine.""" request_outputs: List[RequestOutput] = [] - # requests_to_abort: List[str] = [] + requests_to_abort: List[str] = [] - for engine_core_output in encore_core_outputs: + for engine_core_output in encore_core_outputs.outputs: request_id = engine_core_output.request_id detokenizer = self.request_states.get(request_id) @@ -261,17 +262,16 @@ def step( request_outputs.append(request_output) # # Free completed requests. - # if request_output.finished: - # self.request_states.pop(request_id) - # # If Request finished but EngineCore not finished, - # # this was caused by a stop string + we need to send - # # an abort signal to the EngineCore. - # if not engine_core_output.finished: - # requests_to_abort.append(request_id) + if request_output.finished: + self.request_states.pop(request_id) + # If Request finished but EngineCore not finished, + # this was caused by a stop string + we need to send + # an abort signal to the EngineCore. + if not engine_core_output.finished: + requests_to_abort.append(request_id) # Return to EngineClient. - # return request_outputs, requests_to_abort - return request_outputs, [] + return request_outputs, requests_to_abort class DetokenizerProc(Detokenizer): """ZMQ-wrapper for running Detokenizer in background process.""" @@ -374,60 +374,95 @@ def signal_handler(signum, frame): if detokenizer is not None: detokenizer = None - def run_busy_loop(self): - """Core busy loop of the Detokenizer.""" + def _handle_from_llm_engine( + self, + from_llm_engine: zmq.Socket, + to_engine_core: zmq.Socket, + ) -> None: + """Handle EngineRequest from the LLMEngine.""" + + pickled_req = from_llm_engine.recv() + req = pickle.loads(pickled_req) + + # Request added by client, add to RequestStates. + if isinstance(req, EngineRequest): + if req.request_id in self.request_states: + raise ValueError( + f"{req.request_id} already in Request States!") + + # Add to RequestStates. + request_state = IncrementalDetokenizer.from_new_request( + self.tokenizer, req) + self.request_states[req.request_id] = request_state + + # Request aborted by client, delete from RequestStates. 
+ elif isinstance(req, EngineAbortRequest): + if req.request_id not in self.request_states: + # If not found, the request is already completed + # and we can safely ignore. + pass + del self.request_states[req.request_id] + + else: + raise ValueError(f"Unknown type: {req}") - try: - # TODO: handle aborted due to client cancellation - # TODO: pickle -> msgpack - # TODO: send stop string aborts back to EngineCore directly + # Forward to EngineCore. + to_engine_core.send(pickled_req) + + def _handle_from_engine_core( + self, + from_engine_core: zmq.Socket, + to_engine_core: zmq.Socket, + to_llm_engine: zmq.Socket, + decoder: msgspec.msgpack.Decoder, + ) -> None: + """Handle Outputs from the EngineCore.""" - decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) + # Deserialize the EngineOutput (use msgpack for performance). + (frame, ) = from_engine_core.recv_multipart(copy=False) + outputs: EngineCoreOutputs = decoder.decode(frame.buffer) - with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as from_engine_core, - zmq_socket_ctx(self.input_path, zmq.PULL) as from_llm_engine, - zmq_socket_ctx(self.engine_core_inputs_path, zmq.PUSH) as to_engine_core, - zmq_socket_ctx(self.output_path, zmq.PUSH) as to_llm_engine): + # Detokenize. + request_outputs, requests_to_abort = self.step(outputs.outputs) - # TODO: avoid poll by having both EngineCore - # and AsyncLLM send to the same socket (unclear why this - # was not working when I originally tried it) - poller = zmq.Poller() - poller.register(from_engine_core, zmq.POLLIN) - poller.register(from_llm_engine, zmq.POLLIN) + # Send request outputs back to LLMEngine. + to_llm_engine.send_pyobj(request_outputs) - epoch = 0 - while True: - logger.info(f"EPOCH: {epoch}") + # Abort requests that finished due to stop strings. + to_engine_core.send_pyobj(EngineAbortRequest(requests_to_abort)) + - socks = dict(poller.poll()) + def run_busy_loop(self): + """Core busy loop of the Detokenizer.""" - # Handle NewRequest. - if from_llm_engine in socks: - pickled_request = from_llm_engine.recv() - request: EngineRequest = pickle.loads(pickled_request) + decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) - assert (request.request_id not in self.request_states) + with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as from_engine_core, + zmq_socket_ctx(self.engine_core_inputs_path, zmq.PUSH) as to_engine_core, + zmq_socket_ctx(self.input_path, zmq.PULL) as from_llm_engine, + zmq_socket_ctx(self.output_path, zmq.PUSH) as to_llm_engine): - # Add to Detokenizer. - request_state = IncrementalDetokenizer.from_new_request(self.tokenizer, request) - self.request_states[request.request_id] = request_state + # TODO(rob): avoid poll by having both EngineCore and + # LLMEngine send to the same socket. + poller = zmq.Poller() + poller.register(from_engine_core, zmq.POLLIN) + poller.register(from_llm_engine, zmq.POLLIN) - # Forward to EngineCore. - to_engine_core.send(pickled_request) + epoch = 0 + while True: + logger.info(f"EPOCH: {epoch}") + socks = dict(poller.poll()) - # Handle EngineCoreOutput. - if from_engine_core in socks: - epoch += 1 + # Handle input from LLMEngine. 
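_handle_from_engine_core() above keeps msgpack on the hot EngineCore -> Detokenizer path: outputs arrive as a single zero-copy frame and are decoded with a typed msgspec decoder. A round-trip sketch with stand-in Structs (the field sets here are trimmed for illustration, not the full EngineCoreOutput):

    from typing import List
    import msgspec

    class Output(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
        request_id: str
        new_token_ids: List[int]
        finished: bool = False

    class Outputs(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
        outputs: List[Output]

    encoder = msgspec.msgpack.Encoder()
    decoder = msgspec.msgpack.Decoder(Outputs)

    buffer = bytearray()
    encoder.encode_into(Outputs(outputs=[Output("req-0", [42])]), buffer)
    decoded = decoder.decode(buffer)
    assert decoded.outputs[0].new_token_ids == [42]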
+ if from_llm_engine in socks: + self._handle_from_llm_engine( + from_llm_engine, to_engine_core) - (frame, ) = from_engine_core.recv_multipart(copy=False) - engine_core_outputs = decoder_out.decode(frame.buffer).outputs - request_outputs, _ = self.step(engine_core_outputs) - to_llm_engine.send_pyobj(request_outputs) - - except Exception as e: - logger.error(e) - raise e + # Handle output from EngineCoreOutput. + if from_engine_core in socks: + epoch += 1 + self._handle_from_engine_core( + from_engine_core, to_llm_engine, decoder) class DetokenizerClient: @@ -450,8 +485,7 @@ def __init__(self, # Get output (RequestOutput) from Detokenizer. output_path = get_open_zmq_ipc_path() - self.output_socket = make_zmq_socket( - self.ctx, + self.output_socket = make_zmq_socket(self.ctx, output_path, zmq.PULL, ) @@ -473,13 +507,3 @@ def shutdown(self): if self.proc_handle.proc.is_alive(): kill_process_tree(self.proc_handle.proc.pid) - - async def add_request_async(self, request: EngineRequest): - """Send new DetokenizerRequest to Detokenizer.""" - - await self.input_socket.send_pyobj(request) - - async def get_output_async(self) -> List[RequestOutput]: - """Get RequestOutputs, RequestsToAbort from Detokenizer.""" - - return await self.output_socket.recv_pyobj() From 28da5b311033a83064263c60b5b33901a5df1a4f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 20:31:22 +0000 Subject: [PATCH 073/132] updated --- vllm/v1/engine/async_llm.py | 141 +++++++++++++++++++++++----------- vllm/v1/engine/core.py | 2 +- vllm/v1/engine/core_client.py | 8 +- vllm/v1/engine/detokenizer.py | 16 ++-- 4 files changed, 112 insertions(+), 55 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6e15665df61de..94d542fa096d7 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,5 +1,22 @@ +# Copyright 2033-2024 The vLLM team. +# Copyright 2023-2024 SGLang Team +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# Inspired by https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/managers/tokenizer_manager.py + import asyncio -import fastapi +import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -43,32 +60,13 @@ def __init__( ) -> None: assert start_engine_loop - self.warned = False self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config - # Tokenizer (+ ensure liveness if running in another process). - self.tokenizer = init_tokenizer_from_configs( - model_config=vllm_config.model_config, - scheduler_config=vllm_config.scheduler_config, - parallel_config=vllm_config.parallel_config, - lora_config=vllm_config.lora_config) - self.tokenizer.ping() - # RequestId -> OutputQueue. self.rid_to_queue: Dict[str, asyncio.Queue[RequestOutput]] = {} - # List of cancelled request ids to be aborted. 
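Each hop in the three-process layout (AsyncLLM -> Detokenizer -> EngineCore and back) is a ZMQ PUSH/PULL pair addressed by an IPC path from get_open_zmq_ipc_path(), which is what DetokenizerClient wires up above. A stripped-down sketch of one such hop; which side binds is decided by make_zmq_socket() in the real code, and the temp path below is made up for illustration:

    import os
    import tempfile
    import zmq

    path = "ipc://" + os.path.join(tempfile.mkdtemp(), "demo.sock")

    ctx = zmq.Context()
    pull = ctx.socket(zmq.PULL)
    pull.bind(path)            # receiver binds in this sketch
    push = ctx.socket(zmq.PUSH)
    push.connect(path)         # sender connects

    push.send_pyobj({"request_id": "req-0", "new_token_ids": [42]})
    print(pull.recv_pyobj())   # -> the dict sent above

    ctx.destroy(linger=0)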
- self.client_aborted_requests: List[str] = [] - - # Processor (converts Inputs --> EngineRequest). - self.processor = Processor( - model_config=vllm_config.model_config, - cache_config=vllm_config.cache_config, - lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer, - input_registry=input_registry) # IPC paths. engine_core_outputs_path = get_open_zmq_ipc_path() @@ -93,7 +91,28 @@ def __init__( usage_context=usage_context, ) + # Tokenizer (+ ensure liveness if running in another process). + # Note: make last to avoid fork before using tokenizers + # and avoid TOKENIZERS_PARALLELISM issues. + self.tokenizer = init_tokenizer_from_configs( + model_config=vllm_config.model_config, + scheduler_config=vllm_config.scheduler_config, + parallel_config=vllm_config.parallel_config, + lora_config=vllm_config.lora_config) + self.tokenizer.ping() + + # Processor (converts Inputs --> EngineRequest). + self.processor = Processor( + model_config=vllm_config.model_config, + cache_config=vllm_config.cache_config, + lora_config=vllm_config.lora_config, + tokenizer=self.tokenizer, + input_registry=input_registry) + + # Create output handler loop during first call to generate(). self.to_create_loop = True + self.gracefully_exit = False + self.asyncio_tasks = set() def __del__(self): self.shutdown() @@ -137,9 +156,6 @@ def shutdown(self): if detokenizer := getattr(self, "detokenizer", None): detokenizer.shutdown() - if handler := getattr(self, "output_handler", None): - handler.cancel() - @classmethod def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: executor_class: Type[Executor] @@ -209,20 +225,9 @@ async def generate( """ try: - # We start the output_handler on the first call to generate() so that - # we can call __init__ before the event loop starts, which enables us - # to handle startup failure gracefully in the OpenAI server. + # Start output_handler on first request. if self.to_create_loop: - import signal - def signal_handler(self, signum=None, frame=None): - logger.warning( - f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." - ) - - self.to_create_loop = False - loop = asyncio.get_event_loop() - loop.create_task(self.output_handler()) - loop.add_signal_handler(signal.SIGTERM, signal_handler) + self.create_output_handler() q = await self.add_request( request_id, @@ -235,14 +240,14 @@ def signal_handler(self, signum=None, frame=None): ) # The output_handler task pushes items into the queue. - # This task pulls from the queue and yields them. + # This task pulls from the queue and yields to caller. while True: # Note: drain queue without await if possible (avoids - # task switching under load --> helps performance). + # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() # Note: both Detokenizer and EngineCore handle their - # own cleanup based on finished. + # own request cleanup based on finished. if out.finished: del self.rid_to_queue[request_id] yield out @@ -251,15 +256,47 @@ def signal_handler(self, signum=None, frame=None): yield out # Client request cancellation is handled through calling - # task.cancel() on generate. So we abort to alert the Detokenizer - # and the EngineCore. + # task.cancel() on generate. So if we get this error, we + # need to abort the request. except asyncio.CancelledError: await self.abort(request_id) raise + + def create_output_handler(self): + """Creates output handler loop. 
Called on first generate().""" + + self.to_create_loop = False + loop = asyncio.get_event_loop() + + # Start output handler. + self.asyncio_tasks.add(loop.create_task(self.output_handler())) + + # Start signal handlers for shutdown. + signal_handler = SignalHandler(self) + loop.add_signal_handler(signal.SIGTERM, signal_handler.signal_handler) + self.asyncio_tasks.add(loop.create_task(self.sigterm_watchdog())) + + + async def sigterm_watchdog(self): + """Handle shutdown from sigterm.""" + + while not self.gracefully_exit: + await asyncio.sleep(5) + # Drain requests + while True: + remain_num_req = len(self.rid_to_state) + logger.info( + f"Gracefully exiting... remaining number of requests {remain_num_req}" + ) + if remain_num_req > 0: + await asyncio.sleep(5) + else: + break + self.shutdown() async def output_handler(self): - """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" + """Background loop: pulls from Detokenizer and pushes to queues.""" epoch = 0 while True: @@ -269,6 +306,7 @@ async def output_handler(self): # 1) Pull outputs from the Detokenizer. outputs: List[RequestOutput] = await self.detokenizer.output_socket.recv_pyobj() + # 2) Put each output into a per request Queue. for out in outputs: if out.request_id not in self.rid_to_queue: raise RuntimeError(f"{out.request_id} " @@ -278,9 +316,14 @@ async def output_handler(self): async def abort(self, request_id: str): - + # Remove from Detokenizer and EngineCore (Detokenizer + # forwards the message to EngineCore). await self.detokenizer.input_socket.send_pyobj( EngineAbortRequest([request_id])) + + # Remove from request output queues. + if request_id in self.rid_to_queue: + del self.rid_to_queue[request_id] if self.log_requests: logger.info("Aborted %s.", request_id) @@ -345,3 +388,15 @@ def errored(self) -> bool: @property def dead_error(self) -> BaseException: return Exception() # TODO: implement + + +class SignalHandler: + def __init__(self, async_llm): + self.async_llm = async_llm + + def signal_handler(self, signum=None, frame=None): + logger.warning( + "SIGTERM received. signum=%s frame=%s. 
Draining " + "requests and shutting down...", signum, frame, + ) + self.async_llm.gracefully_exit = True diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 67920688f0031..20ed8c93e11df 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -243,7 +243,7 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the EngineCore.""" - + # Loop until process is sent a SIGINT or SIGTERM epoch = 0 while True: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 07f0ae2c9059f..5829ef350f438 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -8,13 +8,9 @@ from vllm.logger import init_logger from vllm.utils import kill_process_tree, get_open_zmq_ipc_path -from vllm.v1.engine import (BackgroundProcHandle, - EngineCoreOutput, EngineCoreOutputs, - EngineCoreProfile, EngineRequest, - EngineRequestType, EngineRequestUnion) +from vllm.v1.engine import (BackgroundProcHandle, EngineCoreOutput, + EngineRequest) from vllm.v1.engine.core import (EngineCore, EngineCoreProc) -from vllm.v1.serial_utils import PickleEncoder -from vllm.v1.utils import make_zmq_socket logger = init_logger(__name__) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index ab06d811b1c70..b243e787024e2 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -420,10 +420,10 @@ def _handle_from_engine_core( # Deserialize the EngineOutput (use msgpack for performance). (frame, ) = from_engine_core.recv_multipart(copy=False) - outputs: EngineCoreOutputs = decoder.decode(frame.buffer) + outputs: EngineCoreOutputs = decoder.decode(frame.buffer) # Detokenize. - request_outputs, requests_to_abort = self.step(outputs.outputs) + request_outputs, requests_to_abort = self.step(outputs) # Send request outputs back to LLMEngine. to_llm_engine.send_pyobj(request_outputs) @@ -450,19 +450,25 @@ def run_busy_loop(self): epoch = 0 while True: - logger.info(f"EPOCH: {epoch}") socks = dict(poller.poll()) # Handle input from LLMEngine. if from_llm_engine in socks: self._handle_from_llm_engine( - from_llm_engine, to_engine_core) + from_llm_engine=from_llm_engine, + to_engine_core=to_engine_core, + ) # Handle output from EngineCoreOutput. 
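DetokenizerProc.run_busy_loop() above multiplexes two inbound sockets, new requests from the LLMEngine and outputs from the EngineCore, with a zmq.Poller and handles whichever is ready. A minimal two-socket poll loop of the same shape; the inproc endpoints and payloads are made up for this sketch:

    import zmq

    ctx = zmq.Context()
    from_llm_engine = ctx.socket(zmq.PULL)
    from_llm_engine.bind("inproc://llm_engine")
    from_engine_core = ctx.socket(zmq.PULL)
    from_engine_core.bind("inproc://engine_core")

    feed = ctx.socket(zmq.PUSH)
    feed.connect("inproc://llm_engine")
    feed.send_pyobj("new request")

    poller = zmq.Poller()
    poller.register(from_llm_engine, zmq.POLLIN)
    poller.register(from_engine_core, zmq.POLLIN)

    socks = dict(poller.poll(timeout=100))   # milliseconds
    if from_llm_engine in socks:
        print("from LLMEngine:", from_llm_engine.recv_pyobj())
    if from_engine_core in socks:
        print("from EngineCore:", from_engine_core.recv_pyobj())

    ctx.destroy(linger=0)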
if from_engine_core in socks: + logger.info(f"EPOCH: {epoch}") epoch += 1 self._handle_from_engine_core( - from_engine_core, to_llm_engine, decoder) + from_engine_core=from_engine_core, + to_engine_core=to_engine_core, + to_llm_engine=to_llm_engine, + decoder=decoder, + ) class DetokenizerClient: From e14def69c99bb3afbb13963e8ae8a94b93d2acc1 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 20:31:54 +0000 Subject: [PATCH 074/132] updated --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 94d542fa096d7..472777bacfcb7 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -152,7 +152,7 @@ def shutdown(self): if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() - + if detokenizer := getattr(self, "detokenizer", None): detokenizer.shutdown() From 074af11ee70ed3d75c54a1bbe5842692f88b53aa Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 20:33:19 +0000 Subject: [PATCH 075/132] updated --- vllm/v1/engine/detokenizer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index b243e787024e2..ef14511754a66 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -241,10 +241,8 @@ def step( request_outputs: List[RequestOutput] = [] requests_to_abort: List[str] = [] - for engine_core_output in encore_core_outputs.outputs: request_id = engine_core_output.request_id - detokenizer = self.request_states.get(request_id) if detokenizer is None: # Ignore output for already-aborted request. @@ -256,12 +254,12 @@ def step( finish_reason=engine_core_output.finish_reason, stop_reason=engine_core_output.stop_reason, ) - + if request_output is not None: # Add to RequestOutputs list. request_outputs.append(request_output) - # # Free completed requests. + # Free completed requests. if request_output.finished: self.request_states.pop(request_id) # If Request finished but EngineCore not finished, From 3df5288e7c824ad01eaa770b58e4df61e9f4a2e3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 20:34:13 +0000 Subject: [PATCH 076/132] updated --- examples/openai_completion_client.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index 1f8b82bc5c9e9..58519f978d340 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -2,7 +2,7 @@ # Modify OpenAI's API key and API base to use vLLM's API server. 
openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8001/v1" +openai_api_base = "http://localhost:8000/v1" client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") @@ -14,12 +14,14 @@ model = models.data[0].id # Completion API -stream = True +stream = False completion = client.completions.create( model=model, prompt="A robot may not injure a human being", echo=False, - stream=stream) + n=2, + stream=stream, + logprobs=3) print("Completion results:") if stream: From 546b0de8b1454676be80e3b26032019d4e35b8c7 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 20:36:01 +0000 Subject: [PATCH 077/132] updated --- vllm/entrypoints/openai/serving_completion.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 2bc0c6d1f1c8f..af5987fcebd6c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -168,8 +168,7 @@ async def create_completion( # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - # result_generator = merge_async_iterators(*generators) - result_generator = generator + result_generator = merge_async_iterators(*generators) model_name = self._get_model_name(lora_request) num_prompts = len(engine_prompts) @@ -265,8 +264,7 @@ async def completion_stream_generator( include_usage, include_continuous_usage = False, False try: - # async for prompt_idx, res in result_generator: - async for res in result_generator: + async for prompt_idx, res in result_generator: prompt_idx = 0 prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs From c700c4a5ea693fc7baa5a27e212ec053b3e4bc2e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 20:36:22 +0000 Subject: [PATCH 078/132] remove --- vllm/entrypoints/openai/serving_completion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index af5987fcebd6c..aaad7b8c7f44c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -265,7 +265,6 @@ async def completion_stream_generator( try: async for prompt_idx, res in result_generator: - prompt_idx = 0 prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt From 5b568daa617803f7a1429d8b0c39136af69d5b2a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 14:37:10 +0000 Subject: [PATCH 079/132] send messages only when needed --- vllm/v1/engine/core.py | 3 ++- vllm/v1/engine/core_client.py | 4 ---- vllm/v1/engine/detokenizer.py | 4 +++- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 20ed8c93e11df..76cdd027ec319 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -272,7 +272,8 @@ def run_busy_loop(self): outputs = self.step() # 4) Put EngineCoreOutputs into the output queue. 
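Patch 077 above restores merge_async_iterators(), so the completion endpoint once again folds all per-prompt generators into a single stream of (prompt_idx, output) pairs, and patch 078 drops the temporary prompt_idx = 0 shim. Roughly what such a merge does, sketched with a queue (a simplified stand-in, not the actual vllm.utils implementation):

    import asyncio
    from typing import Any, AsyncIterator, Tuple

    async def merge(*its: AsyncIterator[Any]) -> AsyncIterator[Tuple[int, Any]]:
        queue: asyncio.Queue = asyncio.Queue()
        done = object()

        async def pump(idx: int, it: AsyncIterator[Any]) -> None:
            async for item in it:
                await queue.put((idx, item))
            await queue.put(done)

        tasks = [asyncio.create_task(pump(i, it)) for i, it in enumerate(its)]
        remaining = len(tasks)
        while remaining:
            item = await queue.get()
            if item is done:
                remaining -= 1
            else:
                yield item

    async def demo() -> None:
        async def gen(i: int):
            yield f"tok-from-prompt-{i}"
        async for prompt_idx, item in merge(gen(0), gen(1)):
            print(prompt_idx, item)

    asyncio.run(demo())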
- self.output_queue.put_nowait(outputs) + if len(outputs) > 0: + self.output_queue.put_nowait(outputs) self._log_stats() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 5829ef350f438..181d6e3874d70 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -2,10 +2,6 @@ import weakref from typing import List, Optional -import msgspec -import zmq -import zmq.asyncio - from vllm.logger import init_logger from vllm.utils import kill_process_tree, get_open_zmq_ipc_path from vllm.v1.engine import (BackgroundProcHandle, EngineCoreOutput, diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index ef14511754a66..ff047284c05eb 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -427,7 +427,9 @@ def _handle_from_engine_core( to_llm_engine.send_pyobj(request_outputs) # Abort requests that finished due to stop strings. - to_engine_core.send_pyobj(EngineAbortRequest(requests_to_abort)) + if len(requests_to_abort) > 0: + to_engine_core.send_pyobj( + EngineAbortRequest(requests_to_abort)) def run_busy_loop(self): From 93c4ea4517e26654d459edaf8d0d9d1744ef4abc Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 15:48:16 +0000 Subject: [PATCH 080/132] added flag for request id headers --- vllm/entrypoints/api_server.py | 4 +++- vllm/entrypoints/openai/api_server.py | 22 +++++++++++++++------- vllm/entrypoints/openai/cli_args.py | 6 +++++- vllm/utils.py | 17 +++++++++++++++++ vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/detokenizer.py | 4 ++-- 6 files changed, 43 insertions(+), 12 deletions(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 95da1c6e7b9bf..09983d9561532 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -21,7 +21,7 @@ from vllm.logger import init_logger from vllm.sampling_params import SamplingParams from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, random_uuid +from vllm.utils import FlexibleArgumentParser, random_uuid, set_ulimit from vllm.version import __version__ as VLLM_VERSION logger = init_logger("vllm.entrypoints.api_server") @@ -118,6 +118,8 @@ async def run_server(args: Namespace, **uvicorn_kwargs: Any) -> None: logger.info("vLLM API server version %s", VLLM_VERSION) logger.info("args: %s", args) + + set_ulimit() app = await init_app(args, llm_engine) assert engine is not None diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 73a4dc1c51185..67419accd9b60 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -64,7 +64,7 @@ from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, - is_valid_ipv6_address) + is_valid_ipv6_address, set_ulimit) from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -533,12 +533,17 @@ async def authentication(request: Request, call_next): status_code=401) return await call_next(request) - # @app.middleware("http") - # async def add_request_id(request: Request, call_next): - # request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex - # response = await call_next(request) - # response.headers["X-Request-Id"] = request_id - # return response + if args.enable_request_id_headers: + logger.warning( + "CAUTION: Enabling X-Request-Id headers in the API Server. 
" + "This can harm performance at high QPS.") + + @app.middleware("http") + async def add_request_id(request: Request, call_next): + request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex + response = await call_next(request) + response.headers["X-Request-Id"] = request_id + return response for middleware in args.middleware: module_path, object_name = middleware.rsplit(".", 1) @@ -662,6 +667,9 @@ async def run_server(args, **uvicorn_kwargs) -> None: sock_addr = (args.host or "", args.port) sock = create_server_socket(sock_addr) + # workaround to ensure user has enough fds available for uvicorn + ipc + set_ulimit() + def signal_handler(*_) -> None: # Interrupt server on sigterm while initializing raise KeyboardInterrupt("terminated") diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 24c206a1261f2..908f8c3532c9e 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -196,7 +196,11 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: action="store_true", help="If specified, will run the OpenAI frontend server in the same " "process as the model serving engine.") - + parser.add_argument( + "--enable-request-id-headers", + action="store_true", + help="If specified, API server will add X-Request-Id header to " + "responses. Caution: this hurts performance at high QPS.") parser.add_argument( "--enable-auto-tool-choice", action="store_true", diff --git a/vllm/utils.py b/vllm/utils.py index 1b90eca1cd6cc..fd9ca5984c4e0 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -10,6 +10,7 @@ import inspect import ipaddress import os +import resource import signal import socket import subprocess @@ -1613,6 +1614,22 @@ def resolve_obj_by_qualname(qualname: str) -> Any: return getattr(module, obj_name) +# Taken from https://github.com/sgl-project/sglang/blob/23e5e50fd5fba7f315e04294f55060a8171fcc69/python/sglang/srt/utils.py#L630 # noqa: E501 +def set_ulimit(target_soft_limit=65535): + resource_type = resource.RLIMIT_NOFILE + current_soft, current_hard = resource.getrlimit(resource_type) + + if current_soft < target_soft_limit: + try: + resource.setrlimit(resource_type, (target_soft_limit, current_hard)) + except ValueError as e: + logger.warning( + "Found ulimit of %s and failed to automatically increase" + "with error %s. This can cause fd limit errors like" + "`OSError: [Errno 24] Too many open files`. Consider " + "increasing with ulimit -n", current_soft, e) + + def kill_process_tree(pid: int): """ Kills all descendant processes of the given pid by sending SIGKILL. diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 472777bacfcb7..fc248ccfc9be0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -301,7 +301,7 @@ async def output_handler(self): epoch = 0 while True: logger.info(f"EPOCH: {epoch}") - epoch+=1 + epoch += 1 # 1) Pull outputs from the Detokenizer. 
outputs: List[RequestOutput] = await self.detokenizer.output_socket.recv_pyobj() diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index ff047284c05eb..8e050e4970bf6 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -14,7 +14,7 @@ AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import get_open_zmq_ipc_path, kill_process_tree -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, +from vllm.v1.engine import (EngineCoreOutputs, BackgroundProcHandle, EngineRequest, EngineAbortRequest) from vllm.v1.utils import (make_zmq_socket, zmq_socket_ctx, @@ -450,6 +450,7 @@ def run_busy_loop(self): epoch = 0 while True: + logger.info(f"EPOCH: {epoch}") socks = dict(poller.poll()) # Handle input from LLMEngine. @@ -461,7 +462,6 @@ def run_busy_loop(self): # Handle output from EngineCoreOutput. if from_engine_core in socks: - logger.info(f"EPOCH: {epoch}") epoch += 1 self._handle_from_engine_core( from_engine_core=from_engine_core, From 548ae691d60420dd8f17c6dcc10127a5378864d2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 15:49:41 +0000 Subject: [PATCH 081/132] fixed too long line --- benchmarks/benchmark_throughput.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 5010bf1988e9b..db7724e1d707c 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -414,6 +414,7 @@ def main(args: argparse.Namespace): for request in requests) total_output_tokens = sum(request.expected_output_len for request in requests) + total_input_tokens = total_num_tokens - total_output_tokens if is_multi_modal: print("\033[91mWARNING\033[0m: Multi-modal request detected. 
The " "following metrics are not accurate because image tokens are not" @@ -422,7 +423,7 @@ def main(args: argparse.Namespace): print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s, " - f"{(total_num_tokens - total_output_tokens) / len(requests)} input tokens/req, " + f"{total_input_tokens / len(requests)} input tokens/req, " f"{(total_output_tokens) / len(requests)} output tokens/req, " ) From 729df02702f6c5e955cc4ce1c2b113a7e7d0c977 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 15:55:51 +0000 Subject: [PATCH 082/132] updated --- benchmarks/benchmark_throughput.py | 3 +-- vllm/entrypoints/api_server.py | 2 +- vllm/entrypoints/openai/api_server.py | 3 ++- vllm/utils.py | 3 ++- vllm/v1/engine/__init__.py | 10 +++++--- vllm/v1/engine/async_llm.py | 35 ++++++++++++++------------- vllm/v1/engine/core.py | 2 +- vllm/v1/engine/core_client.py | 2 +- vllm/v1/utils.py | 11 +++------ 9 files changed, 36 insertions(+), 35 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index db7724e1d707c..990548c247822 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -424,8 +424,7 @@ def main(args: argparse.Namespace): f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s, " f"{total_input_tokens / len(requests)} input tokens/req, " - f"{(total_output_tokens) / len(requests)} output tokens/req, " - ) + f"{(total_output_tokens) / len(requests)} output tokens/req, ") # Output JSON results if specified if args.output_json: diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 09983d9561532..daefbff7e5178 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -118,7 +118,7 @@ async def run_server(args: Namespace, **uvicorn_kwargs: Any) -> None: logger.info("vLLM API server version %s", VLLM_VERSION) logger.info("args: %s", args) - + set_ulimit() app = await init_app(args, llm_engine) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 67419accd9b60..2aa666548bec5 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -540,7 +540,8 @@ async def authentication(request: Request, call_next): @app.middleware("http") async def add_request_id(request: Request, call_next): - request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex + request_id = request.headers.get( + "X-Request-Id") or uuid.uuid4().hex response = await call_next(request) response.headers["X-Request-Id"] = request_id return response diff --git a/vllm/utils.py b/vllm/utils.py index fd9ca5984c4e0..2a8af5ea2d5c5 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1621,7 +1621,8 @@ def set_ulimit(target_soft_limit=65535): if current_soft < target_soft_limit: try: - resource.setrlimit(resource_type, (target_soft_limit, current_hard)) + resource.setrlimit(resource_type, + (target_soft_limit, current_hard)) except ValueError as e: logger.warning( "Found ulimit of %s and failed to automatically increase" diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 4e84d763ae9be..7200aa9a208ee 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -32,17 +32,22 @@ class EngineRequest: arrival_time: float lora_request: Optional[LoRARequest] + @dataclass class 
EngineAbortRequest: request_ids: List[str] + @dataclass class EngineProfileRequest: is_start: bool -EngineRequestUnion = Union[EngineRequest, EngineAbortRequest, EngineProfileRequest] + +EngineRequestUnion = Union[EngineRequest, EngineAbortRequest, + EngineProfileRequest] + class EngineCoreOutput( msgspec.Struct, @@ -68,6 +73,3 @@ class EngineCoreOutputs( # [num_reqs] outputs: List[EngineCoreOutput] - - - diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fc248ccfc9be0..81d65aec2892f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -102,12 +102,11 @@ def __init__( self.tokenizer.ping() # Processor (converts Inputs --> EngineRequest). - self.processor = Processor( - model_config=vllm_config.model_config, - cache_config=vllm_config.cache_config, - lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer, - input_registry=input_registry) + self.processor = Processor(model_config=vllm_config.model_config, + cache_config=vllm_config.cache_config, + lora_config=vllm_config.lora_config, + tokenizer=self.tokenizer, + input_registry=input_registry) # Create output handler loop during first call to generate(). self.to_create_loop = True @@ -187,7 +186,7 @@ async def add_request( engine_request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - + # 2) Create Queue (output_handler() pushes, generate() pulls) self.rid_to_queue[request_id] = asyncio.Queue() @@ -254,14 +253,14 @@ async def generate( break yield out - + # Client request cancellation is handled through calling # task.cancel() on generate. So if we get this error, we # need to abort the request. except asyncio.CancelledError: await self.abort(request_id) raise - + def create_output_handler(self): """Creates output handler loop. Called on first generate().""" @@ -276,7 +275,6 @@ def create_output_handler(self): loop.add_signal_handler(signal.SIGTERM, signal_handler.signal_handler) self.asyncio_tasks.add(loop.create_task(self.sigterm_watchdog())) - async def sigterm_watchdog(self): """Handle shutdown from sigterm.""" @@ -294,7 +292,6 @@ async def sigterm_watchdog(self): break self.shutdown() - async def output_handler(self): """Background loop: pulls from Detokenizer and pushes to queues.""" @@ -304,17 +301,18 @@ async def output_handler(self): epoch += 1 # 1) Pull outputs from the Detokenizer. - outputs: List[RequestOutput] = await self.detokenizer.output_socket.recv_pyobj() + outputs: List[ + RequestOutput] = await self.detokenizer.output_socket.recv_pyobj( + ) # 2) Put each output into a per request Queue. for out in outputs: if out.request_id not in self.rid_to_queue: raise RuntimeError(f"{out.request_id} " - "not in RequestStates") + "not in RequestStates") self.rid_to_queue[out.request_id].put_nowait(out) - async def abort(self, request_id: str): # Remove from Detokenizer and EngineCore (Detokenizer # forwards the message to EngineCore). @@ -324,7 +322,7 @@ async def abort(self, request_id: str): # Remove from request output queues. if request_id in self.rid_to_queue: del self.rid_to_queue[request_id] - + if self.log_requests: logger.info("Aborted %s.", request_id) @@ -391,12 +389,15 @@ def dead_error(self) -> BaseException: class SignalHandler: + def __init__(self, async_llm): self.async_llm = async_llm def signal_handler(self, signum=None, frame=None): logger.warning( - "SIGTERM received. signum=%s frame=%s. 
Draining " - "requests and shutting down...", signum, frame, + "SIGTERM received. signum=%s frame=%s. Draining " + "requests and shutting down...", + signum, + frame, ) self.async_llm.gracefully_exit = True diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 76cdd027ec319..e2970ac9cfa70 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -325,4 +325,4 @@ def process_output_socket(self, output_path: str): engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) encoder.encode_into(outputs, buffer) - socket.send_multipart((buffer,), copy=False) + socket.send_multipart((buffer, ), copy=False) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 181d6e3874d70..ba8b4c203801f 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -4,7 +4,7 @@ from vllm.logger import init_logger from vllm.utils import kill_process_tree, get_open_zmq_ipc_path -from vllm.v1.engine import (BackgroundProcHandle, EngineCoreOutput, +from vllm.v1.engine import (BackgroundProcHandle, EngineCoreOutput, EngineRequest) from vllm.v1.engine.core import (EngineCore, EngineCoreProc) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index b141110e8c2ab..492c9094f8307 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -80,18 +80,15 @@ def __len__(self): return len(self._x) -def make_zmq_socket( - ctx: Union[zmq.asyncio.Context, zmq.Context], - path: str, - type: Any - ) -> Union[zmq.Socket, zmq.asyncio.Socket]: +def make_zmq_socket(ctx: Union[zmq.asyncio.Context, zmq.Context], path: str, + type: Any) -> Union[zmq.Socket, zmq.asyncio.Socket]: """Make a ZMQ socket with the proper bind/connext semantics.""" import psutil mem = psutil.virtual_memory() socket = ctx.socket(type) - + total_mem = mem.total / 1024**3 available_mem = mem.available / 1024**3 if total_mem > 32 and available_mem > 16: @@ -112,6 +109,7 @@ def make_zmq_socket( return socket + @contextmanager def zmq_socket_ctx( path: str, @@ -151,4 +149,3 @@ def wait_for_startup( except BaseException as e: logger.exception(e) raise e - From 6ec9dcb122728117e952d003ab06d3e0df1d168b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 15:57:50 +0000 Subject: [PATCH 083/132] updated --- vllm/v1/engine/async_llm.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 81d65aec2892f..9d3606f0d7229 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -68,6 +68,21 @@ def __init__( # RequestId -> OutputQueue. self.rid_to_queue: Dict[str, asyncio.Queue[RequestOutput]] = {} + # Tokenizer (+ ensure liveness if running in another process). + self.tokenizer = init_tokenizer_from_configs( + model_config=vllm_config.model_config, + scheduler_config=vllm_config.scheduler_config, + parallel_config=vllm_config.parallel_config, + lora_config=vllm_config.lora_config) + self.tokenizer.ping() + + # Processor (converts Inputs --> EngineRequest). + self.processor = Processor(model_config=vllm_config.model_config, + cache_config=vllm_config.cache_config, + lora_config=vllm_config.lora_config, + tokenizer=self.tokenizer, + input_registry=input_registry) + # IPC paths. engine_core_outputs_path = get_open_zmq_ipc_path() engine_core_inputs_path = get_open_zmq_ipc_path() @@ -91,23 +106,6 @@ def __init__( usage_context=usage_context, ) - # Tokenizer (+ ensure liveness if running in another process). 
- # Note: make last to avoid fork before using tokenizers - # and avoid TOKENIZERS_PARALLELISM issues. - self.tokenizer = init_tokenizer_from_configs( - model_config=vllm_config.model_config, - scheduler_config=vllm_config.scheduler_config, - parallel_config=vllm_config.parallel_config, - lora_config=vllm_config.lora_config) - self.tokenizer.ping() - - # Processor (converts Inputs --> EngineRequest). - self.processor = Processor(model_config=vllm_config.model_config, - cache_config=vllm_config.cache_config, - lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer, - input_registry=input_registry) - # Create output handler loop during first call to generate(). self.to_create_loop = True self.gracefully_exit = False From fc6a20d786a045b73e9e9a24a256374ddb06cf1e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 16:05:36 +0000 Subject: [PATCH 084/132] make pr smaller --- vllm/v1/engine/core.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e2970ac9cfa70..8623612c6264d 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -92,8 +92,6 @@ def _initialize_kv_caches(self, def add_request(self, request: EngineRequest): """Add request to the scheduler.""" - logger.debug("Adding request: %s", request.request_id) - if request.mm_hashes is not None: # Here, if hash exists for an image, then it will be fetched # from the cache, else it will be added to the cache. @@ -104,15 +102,12 @@ def add_request(self, request: EngineRequest): request.mm_inputs = self.mm_input_mapper_server.process_inputs( request.mm_inputs, request.mm_hashes) - # TODO: instead of sending EngineRequest, should we just send - # around Request? req = Request.from_engine_core_request(request) self.scheduler.add_request(req) def abort_requests(self, request_ids: List[str]): """Abort requests from the scheduler.""" - logger.debug("Aborting requests: %s", request_ids) self.scheduler.finish_requests(request_ids, RequestStatus.FINISHED_ABORTED) From 930ccc2ff19ebb606673564384b3784c36eefd54 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 16:06:37 +0000 Subject: [PATCH 085/132] update logging timing --- vllm/v1/engine/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 8623612c6264d..2f864a1a4a334 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -32,7 +32,7 @@ POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -LOGGING_TIME_S = 1 +LOGGING_TIME_S = 5 class EngineCore: From 52d370ff5a87c9773165ef8a5c84d37d91b01342 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 16:11:10 +0000 Subject: [PATCH 086/132] cleanup nits --- vllm/v1/engine/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 7200aa9a208ee..2fec90c2fec6c 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,4 +1,3 @@ -import enum from dataclasses import dataclass from multiprocessing.process import BaseProcess from typing import List, Optional, Union @@ -7,7 +6,7 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict -from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.sampling_params import SamplingParams @dataclass From 8939e2e6d4027d81e946726afad1deb5c6f1491d Mon Sep 17 00:00:00 2001 From: 
"rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 16:11:48 +0000 Subject: [PATCH 087/132] cleanup --- vllm/v1/engine/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 2fec90c2fec6c..ae10e6f3b8c29 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -19,7 +19,6 @@ class BackgroundProcHandle: @dataclass class EngineRequest: - request_id: str prompt: Optional[str] prompt_token_ids: List[int] @@ -34,13 +33,11 @@ class EngineRequest: @dataclass class EngineAbortRequest: - request_ids: List[str] @dataclass class EngineProfileRequest: - is_start: bool From c2c2e570f3623327edbc22b57c5d46ac2dd5245d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 16:31:22 +0000 Subject: [PATCH 088/132] add sigquit handlers for shutdown --- vllm/utils.py | 8 ++++++- vllm/v1/engine/async_llm.py | 26 ++++++++++++++-------- vllm/v1/engine/core.py | 14 +++++++----- vllm/v1/engine/detokenizer.py | 41 ++++++++++++++++++++--------------- 4 files changed, 55 insertions(+), 34 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 2a8af5ea2d5c5..caed96d200bfc 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -16,6 +16,7 @@ import subprocess import sys import tempfile +import traceback import threading import time import uuid @@ -1614,7 +1615,6 @@ def resolve_obj_by_qualname(qualname: str) -> Any: return getattr(module, obj_name) -# Taken from https://github.com/sgl-project/sglang/blob/23e5e50fd5fba7f315e04294f55060a8171fcc69/python/sglang/srt/utils.py#L630 # noqa: E501 def set_ulimit(target_soft_limit=65535): resource_type = resource.RLIMIT_NOFILE current_soft, current_hard = resource.getrlimit(resource_type) @@ -1631,6 +1631,12 @@ def set_ulimit(target_soft_limit=65535): "increasing with ulimit -n", current_soft, e) +def get_exception_traceback(): + etype, value, tb = sys.exc_info() + err_str = "".join(traceback.format_exception(etype, value, tb)) + return err_str + + def kill_process_tree(pid: int): """ Kills all descendant processes of the given pid by sending SIGKILL. diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 9d3606f0d7229..230eca6a64e87 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -15,6 +15,7 @@ # Inspired by https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/managers/tokenizer_manager.py +import os import asyncio import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union @@ -34,7 +35,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import get_open_zmq_ipc_path +from vllm.utils import get_open_zmq_ipc_path, kill_process_tree from vllm.v1.engine import EngineAbortRequest from vllm.v1.engine.core_client import MultiprocessEngineCore from vllm.v1.engine.detokenizer import DetokenizerClient @@ -65,6 +66,13 @@ def __init__( self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config + # Register the signal handler. + # The child processes will send SIGQUIT to this process when + # any error happens. This process then clean up the whole tree. + def sigquit_handler(signum, frame): + kill_process_tree(os.getpid()) + signal.signal(signal.SIGQUIT, sigquit_handler) + # RequestId -> OutputQueue. 
self.rid_to_queue: Dict[str, asyncio.Queue[RequestOutput]] = {} @@ -84,23 +92,23 @@ def __init__( input_registry=input_registry) # IPC paths. - engine_core_outputs_path = get_open_zmq_ipc_path() - engine_core_inputs_path = get_open_zmq_ipc_path() + from_engine_core_path = get_open_zmq_ipc_path() + to_engine_core_path = get_open_zmq_ipc_path() - # Detokenizer (converts EngineCoreOutputs --> RequestOutput). + # Detokenizer (background process). self.detokenizer = DetokenizerClient( - engine_core_outputs_path=engine_core_outputs_path, - engine_core_inputs_path=engine_core_inputs_path, + from_engine_core_path=from_engine_core_path, + to_engine_core_path=to_engine_core_path, tokenizer_name=vllm_config.model_config.tokenizer, tokenizer_mode=vllm_config.model_config.tokenizer_mode, trust_remote_code=vllm_config.model_config.trust_remote_code, revision=vllm_config.model_config.tokenizer_revision, ) - # EngineCore (starts the engine in background process). + # EngineCore (background process). self.engine_core = MultiprocessEngineCore( - input_path=engine_core_inputs_path, - output_path=engine_core_outputs_path, + input_path=to_engine_core_path, + output_path=from_engine_core_path, vllm_config=vllm_config, executor_class=executor_class, usage_context=usage_context, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 2f864a1a4a334..0a4069cdbb082 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1,4 +1,4 @@ -import pickle +import psutil import queue import signal import threading @@ -15,7 +15,7 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext -from vllm.utils import get_open_zmq_ipc_path +from vllm.utils import get_open_zmq_ipc_path, get_exception_traceback from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineAbortRequest, EngineRequest, @@ -24,7 +24,6 @@ from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus -from vllm.v1.serial_utils import PickleEncoder from vllm.v1.utils import zmq_socket_ctx, wait_for_startup from vllm.version import __version__ as VLLM_VERSION @@ -219,6 +218,8 @@ def signal_handler(signum, frame): signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGINT, signal_handler) + parent_process = psutil.Process().parent() + engine_core = None try: engine_core = EngineCoreProc(*args, **kwargs) @@ -227,9 +228,10 @@ def signal_handler(signum, frame): except SystemExit: logger.debug("EngineCore interrupted.") - except BaseException as e: - logger.exception(e) - raise e + except Exception: + traceback = get_exception_traceback() + logger.error(f"EngineCore hit an exception: {traceback}") + parent_process.send_signal(signal.SIGQUIT) finally: if engine_core is not None: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 8e050e4970bf6..64a117f40e22f 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,3 +1,4 @@ +import psutil import pickle import zmq.asyncio import msgspec @@ -13,7 +14,8 @@ from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.utils import get_open_zmq_ipc_path, kill_process_tree +from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, + get_exception_traceback) 
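The helpers being imported here belong to a single error-propagation pattern that this patch wires into both background processes: the child's busy loop catches any unhandled exception, logs the formatted traceback, and sends SIGQUIT to its parent, whose handler then kills the whole process tree. A condensed sketch of that flow, using the helpers from vllm/utils.py shown above; worker_main and install_parent_sigquit_handler are illustrative names, not functions in the diff (the real code lives in run_engine_core, run_detokenizer, and the server startup):

    import os
    import signal

    import psutil

    from vllm.logger import init_logger
    from vllm.utils import get_exception_traceback, kill_process_tree

    logger = init_logger(__name__)


    def worker_main():
        # Child side (EngineCore / Detokenizer busy loop).
        parent_process = psutil.Process().parent()
        try:
            while True:
                ...  # poll sockets and do work
        except SystemExit:
            logger.debug("Worker interrupted.")
        except Exception:
            traceback = get_exception_traceback()
            logger.error("Worker hit an exception: %s", traceback)
            # Wake the parent so it can tear down the whole process tree.
            parent_process.send_signal(signal.SIGQUIT)


    def install_parent_sigquit_handler():
        # Parent side (front-end process).
        def sigquit_handler(signum, frame):
            kill_process_tree(os.getpid())

        signal.signal(signal.SIGQUIT, sigquit_handler)
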
from vllm.v1.engine import (EngineCoreOutputs, BackgroundProcHandle, EngineRequest, EngineAbortRequest) @@ -279,8 +281,8 @@ class DetokenizerProc(Detokenizer): def __init__( self, *args, - engine_core_outputs_path: str, - engine_core_inputs_path: str, + from_engine_core_path: str, + to_engine_core_path: str, input_path: str, output_path: str, ready_path: str, @@ -288,8 +290,8 @@ def __init__( ): super().__init__(*args, **kwargs) - self.engine_core_outputs_path = engine_core_outputs_path - self.engine_core_inputs_path = engine_core_inputs_path + self.from_engine_core_path = from_engine_core_path + self.to_engine_core_path = to_engine_core_path self.input_path = input_path self.output_path = output_path @@ -300,8 +302,8 @@ def __init__( @staticmethod def make_detokenizer_process( - engine_core_outputs_path: str, - engine_core_inputs_path: str, + from_engine_core_path: str, + to_engine_core_path: str, input_path: str, output_path: str, tokenizer_name: str, @@ -313,8 +315,8 @@ def make_detokenizer_process( ready_path = get_open_zmq_ipc_path() process_kwargs = { - "engine_core_outputs_path": engine_core_outputs_path, - "engine_core_inputs_path": engine_core_inputs_path, + "from_engine_core_path": from_engine_core_path, + "to_engine_core_path": to_engine_core_path, "input_path": input_path, "output_path": output_path, "ready_path": ready_path, @@ -356,6 +358,8 @@ def signal_handler(signum, frame): signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGINT, signal_handler) + parent_process = psutil.Process().parent() + detokenizer = None try: detokenizer = DetokenizerProc(*args, **kwargs) @@ -364,9 +368,10 @@ def signal_handler(signum, frame): except SystemExit: logger.debug("Detokenizer interrupted.") - except BaseException as e: - logger.exception(e) - raise e + except Exception: + traceback = get_exception_traceback() + logger.error(f"Detokenizer hit an exception: {traceback}") + parent_process.send_signal(signal.SIGQUIT) finally: if detokenizer is not None: @@ -437,8 +442,8 @@ def run_busy_loop(self): decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) - with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as from_engine_core, - zmq_socket_ctx(self.engine_core_inputs_path, zmq.PUSH) as to_engine_core, + with (zmq_socket_ctx(self.from_engine_core_path, zmq.PULL) as from_engine_core, + zmq_socket_ctx(self.to_engine_core_path, zmq.PUSH) as to_engine_core, zmq_socket_ctx(self.input_path, zmq.PULL) as from_llm_engine, zmq_socket_ctx(self.output_path, zmq.PUSH) as to_llm_engine): @@ -474,8 +479,8 @@ class DetokenizerClient: def __init__(self, *args, - engine_core_outputs_path: str, - engine_core_inputs_path: str, + from_engine_core_path: str, + to_engine_core_path: str, **kwargs): # ZMQ setup. 
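For reference, the run_busy_loop wiring above amounts to four point-to-point ipc links multiplexed with a zmq.Poller: PULL from the LLMEngine/AsyncLLM front-end, PULL from EngineCore, and a PUSH back to each. A stripped-down sketch under those assumptions; busy_loop is an illustrative name and the forwarding bodies are placeholders for the real request bookkeeping and detokenization step:

    import zmq

    from vllm.v1.utils import zmq_socket_ctx


    def busy_loop(from_engine_core_path: str, to_engine_core_path: str,
                  input_path: str, output_path: str):
        # Same PUSH/PULL pairing over ipc:// paths as DetokenizerProc.
        with (zmq_socket_ctx(from_engine_core_path, zmq.PULL) as from_engine_core,
              zmq_socket_ctx(to_engine_core_path, zmq.PUSH) as to_engine_core,
              zmq_socket_ctx(input_path, zmq.PULL) as from_llm_engine,
              zmq_socket_ctx(output_path, zmq.PUSH) as to_llm_engine):

            poller = zmq.Poller()
            poller.register(from_llm_engine, zmq.POLLIN)
            poller.register(from_engine_core, zmq.POLLIN)

            while True:
                socks = dict(poller.poll())

                if from_llm_engine in socks:
                    # New request or abort from the front-end: update local
                    # state, then forward to EngineCore.
                    to_engine_core.send(from_llm_engine.recv())

                if from_engine_core in socks:
                    # EngineCoreOutputs: detokenize, then push RequestOutputs
                    # back to the front-end.
                    to_llm_engine.send(from_engine_core.recv())
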
@@ -500,8 +505,8 @@ def __init__(self, self.proc_handle: Optional[BackgroundProcHandle] self.proc_handle = DetokenizerProc.make_detokenizer_process( *args, - engine_core_outputs_path=engine_core_outputs_path, - engine_core_inputs_path=engine_core_inputs_path, + from_engine_core_path=from_engine_core_path, + to_engine_core_path=to_engine_core_path, input_path=input_path, output_path=output_path, **kwargs, From 51b498df94c208a3e614025df979d0e3eb74c61a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 16:32:34 +0000 Subject: [PATCH 089/132] updated --- vllm/v1/engine/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 0a4069cdbb082..5819e67a24803 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -230,7 +230,7 @@ def signal_handler(signum, frame): except Exception: traceback = get_exception_traceback() - logger.error(f"EngineCore hit an exception: {traceback}") + logger.error("EngineCore hit an exception: %s", traceback) parent_process.send_signal(signal.SIGQUIT) finally: From 84c08b14be993510179b49dfa11bd8287c8d9329 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 17:01:06 +0000 Subject: [PATCH 090/132] signifcantly better error handling --- vllm/entrypoints/openai/api_server.py | 12 +++++++++--- vllm/v1/engine/async_llm.py | 9 +-------- vllm/v1/engine/core.py | 4 +++- vllm/v1/engine/detokenizer.py | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2aa666548bec5..28542c804bbae 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -64,7 +64,7 @@ from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, - is_valid_ipv6_address, set_ulimit) + is_valid_ipv6_address, set_ulimit, kill_process_tree) from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -671,11 +671,17 @@ async def run_server(args, **uvicorn_kwargs) -> None: # workaround to ensure user has enough fds available for uvicorn + ipc set_ulimit() - def signal_handler(*_) -> None: + def sigterm_handler(*_) -> None: # Interrupt server on sigterm while initializing raise KeyboardInterrupt("terminated") + signal.signal(signal.SIGTERM, sigterm_handler) + + # The child processes will send SIGQUIT to this process when + # any error happens. This process then clean up the whole tree. + def sigquit_handler(signum, frame): + kill_process_tree(os.getpid()) + signal.signal(signal.SIGQUIT, sigquit_handler) - signal.signal(signal.SIGTERM, signal_handler) async with build_async_engine_client(args) as engine_client: app = build_app(args) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 230eca6a64e87..9ed77ca8a491a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -60,19 +60,12 @@ def __init__( start_engine_loop: bool = True, ) -> None: assert start_engine_loop - + self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config - # Register the signal handler. - # The child processes will send SIGQUIT to this process when - # any error happens. This process then clean up the whole tree. 
- def sigquit_handler(signum, frame): - kill_process_tree(os.getpid()) - signal.signal(signal.SIGQUIT, sigquit_handler) - # RequestId -> OutputQueue. self.rid_to_queue: Dict[str, asyncio.Queue[RequestOutput]] = {} diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5819e67a24803..bf8bf63951aa7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -246,6 +246,8 @@ def run_busy_loop(self): while True: logger.info(f"EPOCH: {epoch}") epoch += 1 + if epoch == 10: + raise ValueError("Died") # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): @@ -260,7 +262,7 @@ def run_busy_loop(self): except BaseException: raise - # 2) Handle any new client requests (Abort or Add). + # 2) Handle any new inputs. while not self.input_queue.empty(): req = self.input_queue.get_nowait() self._handle_client_request(req) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 64a117f40e22f..e71ce00b3cc5f 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -456,7 +456,7 @@ def run_busy_loop(self): epoch = 0 while True: logger.info(f"EPOCH: {epoch}") - socks = dict(poller.poll()) + socks = dict(poller.poll(timeout=1000)) # Handle input from LLMEngine. if from_llm_engine in socks: From 91aceba10f4321d955a959875344d03538d1602d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 18:28:31 +0000 Subject: [PATCH 091/132] proper shutdown of output loop --- vllm/v1/engine/async_llm.py | 117 +++++++++++----------------------- vllm/v1/engine/core.py | 58 +++++++++++++++-- vllm/v1/engine/detokenizer.py | 4 +- 3 files changed, 94 insertions(+), 85 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 9ed77ca8a491a..f909d59dd3e5b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -15,9 +15,7 @@ # Inspired by https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/managers/tokenizer_manager.py -import os import asyncio -import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -35,10 +33,10 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import get_open_zmq_ipc_path, kill_process_tree +from vllm.utils import get_open_zmq_ipc_path from vllm.v1.engine import EngineAbortRequest -from vllm.v1.engine.core_client import MultiprocessEngineCore -from vllm.v1.engine.detokenizer import DetokenizerClient +from vllm.v1.engine.core import MPEngineCoreClient +from vllm.v1.engine.detokenizer import MPDetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor @@ -89,7 +87,7 @@ def __init__( to_engine_core_path = get_open_zmq_ipc_path() # Detokenizer (background process). - self.detokenizer = DetokenizerClient( + self.detokenizer = MPDetokenizerClient( from_engine_core_path=from_engine_core_path, to_engine_core_path=to_engine_core_path, tokenizer_name=vllm_config.model_config.tokenizer, @@ -99,7 +97,7 @@ def __init__( ) # EngineCore (background process). 
- self.engine_core = MultiprocessEngineCore( + self.engine_core = MPEngineCoreClient( input_path=to_engine_core_path, output_path=from_engine_core_path, vllm_config=vllm_config, @@ -107,9 +105,7 @@ def __init__( usage_context=usage_context, ) - # Create output handler loop during first call to generate(). - self.to_create_loop = True - self.gracefully_exit = False + self.output_handler: Optional[asyncio.Task] = None self.asyncio_tasks = set() def __del__(self): @@ -148,6 +144,9 @@ def from_engine_args( def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" + if output_handler := getattr(self, "output_hander", None): + output_handler.cancel() + if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() @@ -224,9 +223,13 @@ async def generate( try: # Start output_handler on first request. - if self.to_create_loop: - self.create_output_handler() + if not self.output_handler: + loop = asyncio.get_event_loop() + self.output_handler = loop.create_task( + self.output_handler_loop()) + # Add to Detokenizer and EngineCore and makes queue + # to which the output_handler will push RequestOutputs. q = await self.add_request( request_id, prompt, @@ -260,61 +263,32 @@ async def generate( await self.abort(request_id) raise - def create_output_handler(self): - """Creates output handler loop. Called on first generate().""" - - self.to_create_loop = False - loop = asyncio.get_event_loop() - - # Start output handler. - self.asyncio_tasks.add(loop.create_task(self.output_handler())) - - # Start signal handlers for shutdown. - signal_handler = SignalHandler(self) - loop.add_signal_handler(signal.SIGTERM, signal_handler.signal_handler) - self.asyncio_tasks.add(loop.create_task(self.sigterm_watchdog())) - - async def sigterm_watchdog(self): - """Handle shutdown from sigterm.""" - - while not self.gracefully_exit: - await asyncio.sleep(5) - # Drain requests - while True: - remain_num_req = len(self.rid_to_state) - logger.info( - f"Gracefully exiting... remaining number of requests {remain_num_req}" - ) - if remain_num_req > 0: - await asyncio.sleep(5) - else: - break - self.shutdown() - - async def output_handler(self): - """Background loop: pulls from Detokenizer and pushes to queues.""" - - epoch = 0 - while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 - - # 1) Pull outputs from the Detokenizer. - outputs: List[ - RequestOutput] = await self.detokenizer.output_socket.recv_pyobj( - ) - - # 2) Put each output into a per request Queue. - for out in outputs: - if out.request_id not in self.rid_to_queue: - raise RuntimeError(f"{out.request_id} " - "not in RequestStates") + async def output_handler_loop(self): + """Background loop: pulls from Detokenizer and push to Queues.""" - self.rid_to_queue[out.request_id].put_nowait(out) + try: + while True: + # Note: use socket directly to avoid calling await multiple + # times, which causes too much task switching at high QPS. + outputs: List[RequestOutput] = [] + outputs = await self.detokenizer.output_socket.recv_pyobj() + + for out in outputs: + # Note: it is possible that a request was aborted + # due to client cancellation while EngineCoreOutputs + # are still flowing, so we just ignore. + if out.request_id in self.rid_to_queue: + self.rid_to_queue[out.request_id].put_nowait(out) + + except asyncio.CancelledError: + logger.info("Shutting down output_handler_loop") + raise + async def abort(self, request_id: str): - # Remove from Detokenizer and EngineCore (Detokenizer - # forwards the message to EngineCore). 
+ """Abort request if the client cancels the request.""" + + # Send abort to Detokenizer (which will fwd to EngineCore) await self.detokenizer.input_socket.send_pyobj( EngineAbortRequest([request_id])) @@ -385,18 +359,3 @@ def errored(self) -> bool: @property def dead_error(self) -> BaseException: return Exception() # TODO: implement - - -class SignalHandler: - - def __init__(self, async_llm): - self.async_llm = async_llm - - def signal_handler(self, signum=None, frame=None): - logger.warning( - "SIGTERM received. signum=%s frame=%s. Draining " - "requests and shutting down...", - signum, - frame, - ) - self.async_llm.gracefully_exit = True diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index bf8bf63951aa7..e9557064ef900 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -3,7 +3,9 @@ import signal import threading import time -from typing import List, Tuple, Type +import os +import weakref +from typing import List, Optional, Tuple, Type import zmq import zmq.asyncio @@ -15,7 +17,8 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext -from vllm.utils import get_open_zmq_ipc_path, get_exception_traceback +from vllm.utils import (get_open_zmq_ipc_path, get_exception_traceback, + kill_process_tree) from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineAbortRequest, EngineRequest, @@ -246,8 +249,8 @@ def run_busy_loop(self): while True: logger.info(f"EPOCH: {epoch}") epoch += 1 - if epoch == 10: - raise ValueError("Died") + # if epoch == 10: + # raise ValueError("Died") # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): @@ -325,3 +328,50 @@ def process_output_socket(self, output_path: str): outputs = EngineCoreOutputs(outputs=engine_core_outputs) encoder.encode_into(outputs, buffer) socket.send_multipart((buffer, ), copy=False) + + +class MPEngineCoreClient: + """ + MPEngineCoreClient: client for multi-proc EngineCore. + EngineCore runs in a background process busy loop, getting + new EngineRequests and returning EngineCoreOutputs + + * pushes EngineRequests via input_socket + * pulls EngineCoreOutputs via output_socket + """ + + def __init__(self, *args, input_path: str, output_path: str, **kwargs): + # Start EngineCore in background process. + self.proc_handle: Optional[BackgroundProcHandle] + self.proc_handle = EngineCoreProc.make_engine_core_process( + *args, + input_path=input_path, + output_path=output_path, + **kwargs, + ) + self._finalizer = weakref.finalize(self, self.shutdown) + + def shutdown(self): + if hasattr(self, "proc_handle") and self.proc_handle: + # Shutdown the process if needed. 
+ if self.proc_handle.proc.is_alive(): + self.proc_handle.proc.terminate() + self.proc_handle.proc.join(5) + + if self.proc_handle.proc.is_alive(): + kill_process_tree(self.proc_handle.proc.pid) + + # Remove zmq ipc socket files + ipc_sockets = [ + self.proc_handle.ready_path, + self.proc_handle.output_path, + self.proc_handle.input_path + ] + for ipc_socket in ipc_sockets: + socket_file = ipc_socket.replace("ipc://", "") + if os and os.path.exists(socket_file): + os.remove(socket_file) + self.proc_handle = None + + def __del__(self): + self.shutdown() diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index e71ce00b3cc5f..f8b388f9ff9d0 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -456,7 +456,7 @@ def run_busy_loop(self): epoch = 0 while True: logger.info(f"EPOCH: {epoch}") - socks = dict(poller.poll(timeout=1000)) + socks = dict(poller.poll()) # Handle input from LLMEngine. if from_llm_engine in socks: @@ -475,7 +475,7 @@ def run_busy_loop(self): decoder=decoder, ) -class DetokenizerClient: +class MPDetokenizerClient: def __init__(self, *args, From 87e7ebd9937b2c2990381f13af6ce7124ceb4285 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 18:29:42 +0000 Subject: [PATCH 092/132] update comment --- vllm/v1/engine/async_llm.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index f909d59dd3e5b..441eab9adaf12 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -257,8 +257,7 @@ async def generate( yield out # Client request cancellation is handled through calling - # task.cancel() on generate. So if we get this error, we - # need to abort the request. + # task.cancel() on generate. So we call abort if canceled. except asyncio.CancelledError: await self.abort(request_id) raise @@ -266,23 +265,18 @@ async def generate( async def output_handler_loop(self): """Background loop: pulls from Detokenizer and push to Queues.""" - try: - while True: - # Note: use socket directly to avoid calling await multiple - # times, which causes too much task switching at high QPS. - outputs: List[RequestOutput] = [] - outputs = await self.detokenizer.output_socket.recv_pyobj() - - for out in outputs: - # Note: it is possible that a request was aborted - # due to client cancellation while EngineCoreOutputs - # are still flowing, so we just ignore. - if out.request_id in self.rid_to_queue: - self.rid_to_queue[out.request_id].put_nowait(out) - - except asyncio.CancelledError: - logger.info("Shutting down output_handler_loop") - raise + while True: + # Note: use socket directly to avoid calling await multiple + # times, which causes too much task switching at high QPS. + outputs: List[RequestOutput] = [] + outputs = await self.detokenizer.output_socket.recv_pyobj() + + for out in outputs: + # Note: it is possible that a request was aborted + # due to client cancellation while EngineCoreOutputs + # are still flowing, so we just ignore. 
+ if out.request_id in self.rid_to_queue: + self.rid_to_queue[out.request_id].put_nowait(out) async def abort(self, request_id: str): From 2e3257c136e9d29b2d7802b126a72e5c63909f3a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 18:33:24 +0000 Subject: [PATCH 093/132] updated --- vllm/v1/engine/detokenizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index f8b388f9ff9d0..8320d3260954a 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -456,6 +456,7 @@ def run_busy_loop(self): epoch = 0 while True: logger.info(f"EPOCH: {epoch}") + socks = dict(poller.poll()) # Handle input from LLMEngine. From 3b13d89644fd3604c3adcc90ea31eb10c39460e5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 18:59:03 +0000 Subject: [PATCH 094/132] support in LLMEngine --- vllm/v1/engine/async_llm.py | 7 +- vllm/v1/engine/core.py | 1 - vllm/v1/engine/core_client.py | 158 ---------------------------------- vllm/v1/engine/llm_engine.py | 92 ++++++++++++-------- 4 files changed, 58 insertions(+), 200 deletions(-) delete mode 100644 vllm/v1/engine/core_client.py diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 441eab9adaf12..586db9ef804c0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -106,7 +106,6 @@ def __init__( ) self.output_handler: Optional[asyncio.Task] = None - self.asyncio_tasks = set() def __del__(self): self.shutdown() @@ -257,18 +256,20 @@ async def generate( yield out # Client request cancellation is handled through calling - # task.cancel() on generate. So we call abort if canceled. + # task.cancel() on generate(). Calling self.abort() forwards the + # cancellation to the EngineCore and Detokenizer. except asyncio.CancelledError: await self.abort(request_id) raise + async def output_handler_loop(self): """Background loop: pulls from Detokenizer and push to Queues.""" while True: # Note: use socket directly to avoid calling await multiple # times, which causes too much task switching at high QPS. - outputs: List[RequestOutput] = [] + outputs: List[RequestOutput] = [] outputs = await self.detokenizer.output_socket.recv_pyobj() for out in outputs: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e9557064ef900..0f12ddceaa026 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -329,7 +329,6 @@ def process_output_socket(self, output_path: str): encoder.encode_into(outputs, buffer) socket.send_multipart((buffer, ), copy=False) - class MPEngineCoreClient: """ MPEngineCoreClient: client for multi-proc EngineCore. diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py deleted file mode 100644 index ba8b4c203801f..0000000000000 --- a/vllm/v1/engine/core_client.py +++ /dev/null @@ -1,158 +0,0 @@ -import os -import weakref -from typing import List, Optional - -from vllm.logger import init_logger -from vllm.utils import kill_process_tree, get_open_zmq_ipc_path -from vllm.v1.engine import (BackgroundProcHandle, EngineCoreOutput, - EngineRequest) -from vllm.v1.engine.core import (EngineCore, EngineCoreProc) - -logger = init_logger(__name__) - - -class EngineCoreClient: - """ - EngineCoreClient: subclasses handle different methods for pushing - and pulling from the EngineCore for asyncio / multiprocessing. 
- - Subclasses: - * InprocClient: In process EngineCore (for V0-style LLMEngine use) - * SyncMPClient: ZMQ + background proc EngineCore (for LLM) - * AsyncMPClient: ZMQ + background proc EngineCore w/ asyncio (for AsyncLLM) - """ - - @staticmethod - def make_client( - *args, - multiprocess_mode: bool, - asyncio_mode: bool, - **kwargs, - ) -> "EngineCoreClient": - - # TODO: support this for debugging purposes. - if asyncio_mode and not multiprocess_mode: - raise NotImplementedError( - "Running EngineCore in asyncio without multiprocessing " - "is not currently supported.") - - if multiprocess_mode and asyncio_mode: - return AsyncMPClient(*args, **kwargs) - - if multiprocess_mode and not asyncio_mode: - return SyncMPClient(*args, **kwargs) - - return InprocClient(*args, **kwargs) - - def shutdown(self): - pass - - def get_output(self) -> List[EngineCoreOutput]: - raise NotImplementedError - - def add_request(self, request: EngineRequest) -> None: - raise NotImplementedError - - def profile(self, is_start: bool = True) -> None: - raise NotImplementedError - - def abort_requests(self, request_ids: List[str]) -> None: - raise NotImplementedError - - async def get_output_async(self) -> List[EngineCoreOutput]: - raise NotImplementedError - - async def add_request_async(self, request: EngineRequest) -> None: - raise NotImplementedError - - async def profile_async(self, is_start: bool = True) -> None: - raise NotImplementedError - - async def abort_requests_async(self, request_ids: List[str]) -> None: - raise NotImplementedError - - -class InprocClient(EngineCoreClient): - """ - InprocClient: client for in-process EngineCore. Intended - for use in LLMEngine for V0-style add_request() and step() - EngineCore setup in this process (no busy loop). - - * pushes EngineRequest directly into the EngineCore - * pulls EngineCoreOutputs by stepping the EngineCore - - TODO: support asyncio-mode for debugging. - """ - - def __init__(self, *args, **kwargs): - self.engine_core = EngineCore(*args, **kwargs) - - def get_output(self) -> List[EngineCoreOutput]: - return self.engine_core.step() - - def add_request(self, request: EngineRequest) -> None: - self.engine_core.add_request(request) - - def abort_requests(self, request_ids: List[str]) -> None: - self.engine_core.abort_requests(request_ids) - - def shutdown(self): - self.engine_core.shutdown() - - def __del__(self): - self.shutdown() - - def profile(self, is_start: bool = True) -> None: - self.engine_core.profile(is_start) - - -class MultiprocessEngineCore: - """ - MultiprocessEngineCore: base client for multi-proc EngineCore. - EngineCore runs in a background process busy loop, getting - new EngineRequests and returning EngineCoreOutputs - - * pushes EngineRequests via input_socket - * pulls EngineCoreOutputs via output_socket - """ - - def __init__( - self, - *args, - input_path: Optional[str] = None, - output_path: Optional[str] = None, - **kwargs, - ): - # Start EngineCore in background process. - self.proc_handle: Optional[BackgroundProcHandle] - self.proc_handle = EngineCoreProc.make_engine_core_process( - *args, - input_path=(input_path or get_open_zmq_ipc_path()), - output_path=(output_path or get_open_zmq_ipc_path()), - **kwargs, - ) - self._finalizer = weakref.finalize(self, self.shutdown) - - def shutdown(self): - if hasattr(self, "proc_handle") and self.proc_handle: - # Shutdown the process if needed. 
- if self.proc_handle.proc.is_alive(): - self.proc_handle.proc.terminate() - self.proc_handle.proc.join(5) - - if self.proc_handle.proc.is_alive(): - kill_process_tree(self.proc_handle.proc.pid) - - # Remove zmq ipc socket files - ipc_sockets = [ - self.proc_handle.ready_path, self.proc_handle.output_path, - self.proc_handle.input_path - ] - for ipc_socket in ipc_sockets: - socket_file = ipc_socket.replace("ipc://", "") - if os and os.path.exists(socket_file): - os.remove(socket_file) - self.proc_handle = None - - def __del__(self): - self.shutdown() diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 8269877bb9c8f..11a9f01a54cf8 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -42,8 +42,8 @@ def __init__( use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: - - # TODO: Can we avoid this? + + self.mulitprocess_mode = multiprocess_mode self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). @@ -62,22 +62,32 @@ def __init__( input_registry=input_registry, mm_registry=mm_registry) - # Detokenizer (converts EngineCoreOutputs --> RequestOutput) - self.detokenizer = Detokenizer( - tokenizer_name=vllm_config.model_config.tokenizer, - tokenizer_mode=vllm_config.model_config.tokenizer_mode, - trust_remote_code=vllm_config.model_config.trust_remote_code, - revision=vllm_config.model_config.tokenizer_revision, - ) - - # EngineCore (gets EngineRequests and gives EngineCoreOutputs) - self.engine_core = EngineCoreClient.make_client( - vllm_config, - executor_class, - usage_context, - multiprocess_mode=multiprocess_mode, - asyncio_mode=False, - ) + if self.multiprocess_mode: + # IPC paths. + from_engine_core_path = get_open_zmq_ipc_path() + to_engine_core_path = get_open_zmq_ipc_path() + + # Detokenizer (background process). + self.detokenizer_client = MPDetokenizerClient( + from_engine_core_path=from_engine_core_path, + to_engine_core_path=to_engine_core_path, + tokenizer_name=vllm_config.model_config.tokenizer, + tokenizer_mode=vllm_config.model_config.tokenizer_mode, + trust_remote_code=vllm_config.model_config.trust_remote_code, + revision=vllm_config.model_config.tokenizer_revision, + ) + + # EngineCore (background process). + self.engine_core_client = MPEngineCoreClient( + input_path=to_engine_core_path, + output_path=from_engine_core_path, + vllm_config=vllm_config, + executor_class=executor_class, + usage_context=usage_context, + ) + + else: + @classmethod def from_engine_args( @@ -149,32 +159,38 @@ def add_request( ) -> None: # 1) Process raw inputs into the request. - detokenizer_req, engine_core_req = self.processor.process_inputs( + engine_request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 2) Add the request to Detokenizer. - self.detokenizer.add_request(detokenizer_req) - - # 3) Add the request to EngineCore. - self.engine_core.add_request(engine_core_req) + # 2) Add to Detokenizer and EngineCore. + if self.multiprocess_mode: + # Send to Detokenizer (which forwards to EngineCore). + self.detokenizer.input_socket.send_pyobj(engine_request) + else: + # Add directly to Detokenizer and EngineCore. + self.detokenizer.add_request(engine_request) + self.engine_core.add_request(engine_request) def step(self) -> List[RequestOutput]: + + if self.multiprocess_mode: + # Get next output from the Detokenizer. 
+ return self.detokenizer.output_socket.recv_pyobj() - # 1) Get EngineCoreOutput from the EngineCore. - engine_core_outputs = self.engine_core.get_output() - - # 2) Detokenizer the EngineCoreOutput. - request_outputs, requests_to_abort = self.detokenizer.step( - engine_core_outputs) - - # 3) Abort requests that finished due to stopping criteria. - if requests_to_abort: - self.abort_request(requests_to_abort) - - return request_outputs - - # TODO(rob): Can we get rid of these? + else: + # 1) Get EngineCoreOutput from the EngineCore. + engine_core_outputs = self.engine_core.step() + + # 2) Detokenizee the EngineCoreOutput. + request_outputs, request_to_abort = self.detokenizer.step( + engine_core_outputs) + + # 3) Abort requests that finished due to stopping criteria. + if requests_to_abort: + self.abort_request(requests_to_abort) + + return request_outputs def get_model_config(self): return self.model_config From 6f383f28db77b5306443faf96b1c27ec59255e12 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:09:14 +0000 Subject: [PATCH 095/132] updated --- vllm/v1/engine/detokenizer.py | 14 +++++++++++++- vllm/v1/engine/llm_engine.py | 27 +++++++++++++++++++++------ 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 8320d3260954a..dfd54bf8502e6 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -235,7 +235,19 @@ def abort_requests( for request_id in request_ids: self.request_states.pop(request_id, None) - + + def add_request( + self, + request: EngineRequest, + ): + """Add new request to the Detokenizer.""" + + assert (request.request_id not in self.request_states) + + request_state = IncrementalDetokenizer.from_new_request( + self.tokenizer, request) + self.request_states[request.request_id] = request_state + def step( self, encore_core_outputs: EngineCoreOutputs, ) -> Tuple[List[RequestOutput], List[str]]: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 11a9f01a54cf8..0e845b94ecd2d 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -17,8 +17,9 @@ from vllm.transformers_utils.tokenizer_group import ( BaseTokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext -from vllm.v1.engine.core_client import EngineCoreClient -from vllm.v1.engine.detokenizer import Detokenizer +from vllm.utils import get_open_zmq_ipc_path +from vllm.v1.engine.core import EngineCore, MPEngineCoreClient +from vllm.v1.engine.detokenizer import Detokenizer, MPDetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor @@ -43,7 +44,7 @@ def __init__( multiprocess_mode: bool = False, ) -> None: - self.mulitprocess_mode = multiprocess_mode + self.multiprocess_mode = multiprocess_mode self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). @@ -87,7 +88,20 @@ def __init__( ) else: + # Detokenizer (in process). + self.detokenizer = Detokenizer( + tokenizer_name=vllm_config.model_config.tokenizer, + tokenizer_mode=vllm_config.model_config.tokenizer_mode, + trust_remote_code=vllm_config.model_config.trust_remote_code, + revision=vllm_config.model_config.tokenizer_revision, + ) + # EngineCore (in process). 
+ self.engine_core = EngineCore( + vllm_config=vllm_config, + executor_class=executor_class, + usage_context=usage_context, + ) @classmethod def from_engine_args( @@ -143,6 +157,7 @@ def validate_outputs(cls, outputs, output_type): def abort_request(self, request_ids: List[str]) -> None: """Remove request_ids from EngineCore and Detokenizer.""" + assert not self.multiprocess_mode self.engine_core.abort_requests(request_ids) self.detokenizer.abort_requests(request_ids) @@ -166,7 +181,7 @@ def add_request( # 2) Add to Detokenizer and EngineCore. if self.multiprocess_mode: # Send to Detokenizer (which forwards to EngineCore). - self.detokenizer.input_socket.send_pyobj(engine_request) + self.detokenizer_client.input_socket.send_pyobj(engine_request) else: # Add directly to Detokenizer and EngineCore. self.detokenizer.add_request(engine_request) @@ -176,14 +191,14 @@ def step(self) -> List[RequestOutput]: if self.multiprocess_mode: # Get next output from the Detokenizer. - return self.detokenizer.output_socket.recv_pyobj() + return self.detokenizer_client.output_socket.recv_pyobj() else: # 1) Get EngineCoreOutput from the EngineCore. engine_core_outputs = self.engine_core.step() # 2) Detokenizee the EngineCoreOutput. - request_outputs, request_to_abort = self.detokenizer.step( + request_outputs, requests_to_abort = self.detokenizer.step( engine_core_outputs) # 3) Abort requests that finished due to stopping criteria. From 30d7333fd8d5ddc330e460d8fb47129bb61ae6e8 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:09:35 +0000 Subject: [PATCH 096/132] nit --- vllm/v1/engine/detokenizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index dfd54bf8502e6..5ed1327630269 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -201,6 +201,7 @@ def _get_next_output_text(self, finished: bool, delta: bool) -> str: return self.output_text[last_offset:length] return "" + class Detokenizer: def __init__(self, From bd49c9cb42be711414ed19036835bba051066f62 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:10:26 +0000 Subject: [PATCH 097/132] updated --- vllm/v1/engine/detokenizer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 5ed1327630269..4ec5808660883 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -235,18 +235,18 @@ def abort_requests( """Remove the request_ids from the Detokenizer.""" for request_id in request_ids: - self.request_states.pop(request_id, None) - + self.request_states.pop(request_id, None) + def add_request( self, request: EngineRequest, ): - """Add new request to the Detokenizer.""" + """Add new request to the Detokenizer.""" assert (request.request_id not in self.request_states) - request_state = IncrementalDetokenizer.from_new_request( - self.tokenizer, request) + request_state = IncrementalDetokenizer.from_new_request( + self.tokenizer, request) self.request_states[request.request_id] = request_state def step( From 2192ae61e62df4777c20c2cdb97a4f8758cb204c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:11:15 +0000 Subject: [PATCH 098/132] make PR cleaner --- vllm/entrypoints/openai/api_server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 28542c804bbae..ed3634a50e7e2 100644 --- 
a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -674,12 +674,14 @@ async def run_server(args, **uvicorn_kwargs) -> None: def sigterm_handler(*_) -> None: # Interrupt server on sigterm while initializing raise KeyboardInterrupt("terminated") + signal.signal(signal.SIGTERM, sigterm_handler) # The child processes will send SIGQUIT to this process when # any error happens. This process then clean up the whole tree. def sigquit_handler(signum, frame): kill_process_tree(os.getpid()) + signal.signal(signal.SIGQUIT, sigquit_handler) From 611d1b0ba997eefc747002b83ae1254620cdd2b3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:12:20 +0000 Subject: [PATCH 099/132] make PR cleaner --- vllm/v1/engine/core.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 0f12ddceaa026..5270f4ebc801f 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -110,6 +110,9 @@ def add_request(self, request: EngineRequest): def abort_requests(self, request_ids: List[str]): """Abort requests from the scheduler.""" + # TODO: The scheduler doesn't really need to know the + # specific finish reason, TBD whether we propagate that + # (i.e. client-aborted vs stop criteria met). self.scheduler.finish_requests(request_ids, RequestStatus.FINISHED_ABORTED) From b12d0e6923b61b59e9042a1162a339f18f0fc47b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:15:06 +0000 Subject: [PATCH 100/132] make pr cleaner --- vllm/v1/engine/detokenizer.py | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 4ec5808660883..3789979f8c8d4 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -237,13 +237,13 @@ def abort_requests( for request_id in request_ids: self.request_states.pop(request_id, None) - def add_request( - self, - request: EngineRequest, - ): + def add_request( + self, + request: EngineRequest, + ): """Add new request to the Detokenizer.""" - assert (request.request_id not in self.request_states) + assert (request.request_id not in self.request_states) request_state = IncrementalDetokenizer.from_new_request( self.tokenizer, request) @@ -400,25 +400,10 @@ def _handle_from_llm_engine( pickled_req = from_llm_engine.recv() req = pickle.loads(pickled_req) - # Request added by client, add to RequestStates. if isinstance(req, EngineRequest): - if req.request_id in self.request_states: - raise ValueError( - f"{req.request_id} already in Request States!") - - # Add to RequestStates. - request_state = IncrementalDetokenizer.from_new_request( - self.tokenizer, req) - self.request_states[req.request_id] = request_state - - # Request aborted by client, delete from RequestStates. + self.add_request(req) elif isinstance(req, EngineAbortRequest): - if req.request_id not in self.request_states: - # If not found, the request is already completed - # and we can safely ignore. 
- pass - del self.request_states[req.request_id] - + self.abort_requests(req.request_ids) else: raise ValueError(f"Unknown type: {req}") From 1dac1f1b9bf59f6fceccd7ed1f6994196cff6a5b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:15:53 +0000 Subject: [PATCH 101/132] more cleanup --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 586db9ef804c0..380266419df52 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -58,7 +58,7 @@ def __init__( start_engine_loop: bool = True, ) -> None: assert start_engine_loop - + self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers = stat_loggers From 40c5cd5348fcb9c41533e35185fe47674edf0dfe Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:16:42 +0000 Subject: [PATCH 102/132] more cleanup --- vllm/v1/engine/async_llm.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 380266419df52..7f2a597bbdd97 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -76,11 +76,13 @@ def __init__( self.tokenizer.ping() # Processor (converts Inputs --> EngineRequest). - self.processor = Processor(model_config=vllm_config.model_config, - cache_config=vllm_config.cache_config, - lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer, - input_registry=input_registry) + self.processor = Processor( + model_config=vllm_config.model_config, + cache_config=vllm_config.cache_config, + lora_config=vllm_config.lora_config, + tokenizer=self.tokenizer, + input_registry=input_registry, + ) # IPC paths. from_engine_core_path = get_open_zmq_ipc_path() From a1e17c41977b88a9640806e2d21d4d48a2cee197 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:18:48 +0000 Subject: [PATCH 103/132] updated --- vllm/v1/engine/async_llm.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7f2a597bbdd97..3601085801a4c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -89,7 +89,7 @@ def __init__( to_engine_core_path = get_open_zmq_ipc_path() # Detokenizer (background process). - self.detokenizer = MPDetokenizerClient( + self.detokenizer_client = MPDetokenizerClient( from_engine_core_path=from_engine_core_path, to_engine_core_path=to_engine_core_path, tokenizer_name=vllm_config.model_config.tokenizer, @@ -99,7 +99,7 @@ def __init__( ) # EngineCore (background process). 
- self.engine_core = MPEngineCoreClient( + self.engine_core_client = MPEngineCoreClient( input_path=to_engine_core_path, output_path=from_engine_core_path, vllm_config=vllm_config, @@ -148,11 +148,11 @@ def shutdown(self): if output_handler := getattr(self, "output_hander", None): output_handler.cancel() - if engine_core := getattr(self, "engine_core", None): - engine_core.shutdown() + if engine_core_client := getattr(self, "engine_core_client", None): + engine_core_client.shutdown() - if detokenizer := getattr(self, "detokenizer", None): - detokenizer.shutdown() + if detokenizer_client := getattr(self, "detokenizer_client", None): + detokenizer_client.shutdown() @classmethod def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: @@ -190,7 +190,7 @@ async def add_request( self.rid_to_queue[request_id] = asyncio.Queue() # 3) Send to Detokenizer (which forwards to EngineCore). - await self.detokenizer.input_socket.send_pyobj(engine_request) + await self.detokenizer_client.input_socket.send_pyobj(engine_request) return self.rid_to_queue[request_id] @@ -272,7 +272,7 @@ async def output_handler_loop(self): # Note: use socket directly to avoid calling await multiple # times, which causes too much task switching at high QPS. outputs: List[RequestOutput] = [] - outputs = await self.detokenizer.output_socket.recv_pyobj() + outputs = await self.detokenizer_client.output_socket.recv_pyobj() for out in outputs: # Note: it is possible that a request was aborted @@ -286,7 +286,7 @@ async def abort(self, request_id: str): """Abort request if the client cancels the request.""" # Send abort to Detokenizer (which will fwd to EngineCore) - await self.detokenizer.input_socket.send_pyobj( + await self.detokenizer_client.input_socket.send_pyobj( EngineAbortRequest([request_id])) # Remove from request output queues. @@ -336,10 +336,10 @@ async def check_health(self) -> None: logger.debug("Called check_health.") async def start_profile(self) -> None: - await self.engine_core.profile_async(True) + await self.engine_core_client.profile_async(True) async def stop_profile(self) -> None: - await self.engine_core.profile_async(False) + await self.engine_core_client.profile_async(False) @property def is_running(self) -> bool: From 921a56aeb1bf1c9c36b36a4602d1c46ffbba7f36 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:19:38 +0000 Subject: [PATCH 104/132] updated comment --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 3601085801a4c..23c6c1938e091 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -75,7 +75,7 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() - # Processor (converts Inputs --> EngineRequest). + # Processor (in process). 
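+ # Converts raw inputs into EngineRequests before they are handed to the background processes.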
self.processor = Processor( model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, From 19aadbb1195481577c9f53bfc8295009f436ca73 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:20:49 +0000 Subject: [PATCH 105/132] updated --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 23c6c1938e091..fe755df95cbe7 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -285,7 +285,7 @@ async def output_handler_loop(self): async def abort(self, request_id: str): """Abort request if the client cancels the request.""" - # Send abort to Detokenizer (which will fwd to EngineCore) + # Send abort to Detokenizer (which will fwd to EngineCore). await self.detokenizer_client.input_socket.send_pyobj( EngineAbortRequest([request_id])) From ddae79cf2934ff659af14b72eb03471d52eea8ef Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:23:05 +0000 Subject: [PATCH 106/132] updated --- vllm/v1/engine/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 0e845b94ecd2d..9bd26ded88b53 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -43,7 +43,7 @@ def __init__( use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: - + self.multiprocess_mode = multiprocess_mode self.model_config = vllm_config.model_config From 4b00ae04c454f34813635b9e5b8500a757b64dcf Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:47:32 +0000 Subject: [PATCH 107/132] factor out proc handle code --- vllm/v1/engine/__init__.py | 20 +++++++++++++-- vllm/v1/engine/core.py | 48 +++++++++++------------------------ vllm/v1/engine/detokenizer.py | 25 +++++++++--------- vllm/v1/utils.py | 26 ++----------------- 4 files changed, 47 insertions(+), 72 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index ae10e6f3b8c29..849592d05b1b2 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,3 +1,4 @@ +import os from dataclasses import dataclass from multiprocessing.process import BaseProcess from typing import List, Optional, Union @@ -7,15 +8,30 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.sampling_params import SamplingParams - +from vllm.utils import kill_process_tree @dataclass class BackgroundProcHandle: proc: BaseProcess - ready_path: str input_path: str output_path: str + def shutdown(self): + # Shutdown the process if needed. 
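+ # Terminate gracefully first, then fall back to killing the process tree.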
+ if self.proc.is_alive(): + self.proc.terminate() + self.proc.join(5) + + if self.proc.is_alive(): + kill_process_tree(self.proc.pid) + + # Remove zmq ipc socket files + ipc_sockets = [self.output_path, self.input_path] + for ipc_socket in ipc_sockets: + socket_file = ipc_socket.replace("ipc://", "") + if os and os.path.exists(socket_file): + os.remove(socket_file) + @dataclass class EngineRequest: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5270f4ebc801f..f4edb1b4e12a2 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -10,6 +10,7 @@ import zmq import zmq.asyncio from msgspec import msgpack +from multiprocessing.connection import Connection from vllm.config import CacheConfig, VllmConfig from vllm.executor.multiproc_worker_utils import get_mp_context @@ -17,8 +18,7 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext -from vllm.utils import (get_open_zmq_ipc_path, get_exception_traceback, - kill_process_tree) +from vllm.utils import get_exception_traceback, kill_process_tree from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineAbortRequest, EngineRequest, @@ -27,7 +27,7 @@ from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus -from vllm.v1.utils import zmq_socket_ctx, wait_for_startup +from vllm.v1.utils import zmq_socket_ctx from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -138,8 +138,6 @@ def profile(self, is_start: bool = True): class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" - READY_STR = "READY" - def __init__( self, vllm_config: VllmConfig, @@ -147,7 +145,7 @@ def __init__( usage_context: UsageContext, input_path: str, output_path: str, - ready_path: str, + ready_pipe: Connection, ): super().__init__(vllm_config, executor_class, usage_context) @@ -166,8 +164,7 @@ def __init__( daemon=True).start() # Send Readiness signal to EngineClient. - with zmq_socket_ctx(ready_path, zmq.PUSH) as ready_socket: - ready_socket.send_string(EngineCoreProc.READY_STR) + ready_pipe.send({"status": "READY"}) @staticmethod def make_engine_core_process( @@ -178,27 +175,29 @@ def make_engine_core_process( output_path: str, ) -> BackgroundProcHandle: context = get_mp_context() - ready_path = get_open_zmq_ipc_path() + reader, writer = context.Pipe(duplex=False) process_kwargs = { "input_path": input_path, "output_path": output_path, - "ready_path": ready_path, + "ready_pipe": writer, "vllm_config": vllm_config, "executor_class": executor_class, "usage_context": usage_context, } + # Run EngineCore busy loop in background process. proc = context.Process(target=EngineCoreProc.run_engine_core, kwargs=process_kwargs) proc.start() - wait_for_startup(proc=proc, - ready_path=ready_path, - ready_str=EngineCoreProc.READY_STR, - timeout_ms=POLLING_TIMEOUT_MS) + + # Wait for startup. + if reader.recv()["status"] != "READY": + raise RuntimeError( + "EngineCore initalization failed. See root cause above." + ) return BackgroundProcHandle(proc=proc, - ready_path=ready_path, input_path=input_path, output_path=output_path) @@ -355,24 +354,7 @@ def __init__(self, *args, input_path: str, output_path: str, **kwargs): def shutdown(self): if hasattr(self, "proc_handle") and self.proc_handle: - # Shutdown the process if needed. 
- if self.proc_handle.proc.is_alive(): - self.proc_handle.proc.terminate() - self.proc_handle.proc.join(5) - - if self.proc_handle.proc.is_alive(): - kill_process_tree(self.proc_handle.proc.pid) - - # Remove zmq ipc socket files - ipc_sockets = [ - self.proc_handle.ready_path, - self.proc_handle.output_path, - self.proc_handle.input_path - ] - for ipc_socket in ipc_sockets: - socket_file = ipc_socket.replace("ipc://", "") - if os and os.path.exists(socket_file): - os.remove(socket_file) + self.proc_handle.shutdown() self.proc_handle = None def __del__(self): diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 3789979f8c8d4..be9857f1f6aa0 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -298,7 +298,7 @@ def __init__( to_engine_core_path: str, input_path: str, output_path: str, - ready_path: str, + write_: str, **kwargs ): super().__init__(*args, **kwargs) @@ -325,14 +325,14 @@ def make_detokenizer_process( revision: Optional[str] = None, ) -> BackgroundProcHandle: context = get_mp_context() - ready_path = get_open_zmq_ipc_path() + reader, writer = context.Pipe(duplex=False) process_kwargs = { "from_engine_core_path": from_engine_core_path, "to_engine_core_path": to_engine_core_path, "input_path": input_path, "output_path": output_path, - "ready_path": ready_path, + "ready_pipe": writer, "tokenizer_name": tokenizer_name, "tokenizer_mode": tokenizer_mode, "trust_remote_code": trust_remote_code, @@ -342,13 +342,14 @@ def make_detokenizer_process( proc = context.Process(target=DetokenizerProc.run_detokenizer, kwargs=process_kwargs) proc.start() - wait_for_startup(proc=proc, - ready_path=ready_path, - ready_str=DetokenizerProc.READY_STR, - timeout_ms=POLLING_TIMEOUT_MS) + + # Wait for startup. + if reader.recv()["status"] != "READY": + raise RuntimeError( + "Detokenizer initalization failed. See root cause above." 
+ ) return BackgroundProcHandle(proc=proc, - ready_path=ready_path, input_path=input_path, output_path=output_path) @@ -512,8 +513,6 @@ def __init__(self, ) def shutdown(self): - self.proc_handle.proc.terminate() - self.proc_handle.proc.join(5) - - if self.proc_handle.proc.is_alive(): - kill_process_tree(self.proc_handle.proc.pid) + if hasattr(self, "proc_handle") and self.proc_handle: + self.proc_handle.shutdown() + self.proc_handle = None diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 492c9094f8307..f610c4c90d3dd 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,4 +1,5 @@ from multiprocessing.process import BaseProcess +from multiprocessing.connection import Connection from collections.abc import Sequence from contextlib import contextmanager @@ -125,27 +126,4 @@ def zmq_socket_ctx( finally: ctx.destroy(linger=0) - - -def wait_for_startup( - proc: BaseProcess, - ready_path: str, - ready_str: str, - timeout_ms: int, -) -> None: - """Wait until a background process is ready.""" - - with zmq_socket_ctx(ready_path, zmq.PULL) as socket: - try: - while socket.poll(timeout=timeout_ms) == 0: - logger.debug("Waiting for background proc to startup.") - - if not proc.is_alive(): - raise RuntimeError("Background process failed to start.") - - message = socket.recv_string() - assert message == ready_str - - except BaseException as e: - logger.exception(e) - raise e + From 467d63e9bba5f0ad7574461e0d334fe323a65151 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:47:50 +0000 Subject: [PATCH 108/132] actually save before commiting --- vllm/v1/engine/core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f4edb1b4e12a2..59b66d0de11b9 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -3,7 +3,6 @@ import signal import threading import time -import os import weakref from typing import List, Optional, Tuple, Type @@ -18,7 +17,7 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext -from vllm.utils import get_exception_traceback, kill_process_tree +from vllm.utils import get_exception_traceback from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineAbortRequest, EngineRequest, From afd4b522d83502a7b317ec701a7b67c1389bd050 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 20:07:09 +0000 Subject: [PATCH 109/132] actually save before commiting --- vllm/v1/engine/__init__.py | 25 ------------------- vllm/v1/engine/async_llm.py | 4 ++- vllm/v1/engine/core.py | 31 ++++++------------------ vllm/v1/engine/llm_engine.py | 20 ++++++++------- vllm/v1/utils.py | 47 ++++++++++++++++++++++++++++++++++-- 5 files changed, 67 insertions(+), 60 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 849592d05b1b2..3af0219db1c14 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,6 +1,4 @@ -import os from dataclasses import dataclass -from multiprocessing.process import BaseProcess from typing import List, Optional, Union import msgspec @@ -8,29 +6,6 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.sampling_params import SamplingParams -from vllm.utils import kill_process_tree - -@dataclass -class BackgroundProcHandle: - proc: BaseProcess - input_path: str - output_path: str - - def 
shutdown(self): - # Shutdown the process if needed. - if self.proc.is_alive(): - self.proc.terminate() - self.proc.join(5) - - if self.proc.is_alive(): - kill_process_tree(self.proc.pid) - - # Remove zmq ipc socket files - ipc_sockets = [self.output_path, self.input_path] - for ipc_socket in ipc_sockets: - socket_file = ipc_socket.replace("ipc://", "") - if os and os.path.exists(socket_file): - os.remove(socket_file) @dataclass diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fe755df95cbe7..a585b9bcac80d 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -186,10 +186,12 @@ async def add_request( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 2) Create Queue (output_handler() pushes, generate() pulls) + # 2) Create Queue (output_handler() pushes, generate() pulls). self.rid_to_queue[request_id] = asyncio.Queue() # 3) Send to Detokenizer (which forwards to EngineCore). + # Note: we forward the request rather than sending to each + # process separately to avoid race conditions in Detokenizer. await self.detokenizer_client.input_socket.send_pyobj(engine_request) return self.rid_to_queue[request_id] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 59b66d0de11b9..01b0f3a92a948 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -3,8 +3,7 @@ import signal import threading import time -import weakref -from typing import List, Optional, Tuple, Type +from typing import List, Tuple, Type import zmq import zmq.asyncio @@ -26,7 +25,8 @@ from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus -from vllm.v1.utils import zmq_socket_ctx +from vllm.v1.utils import (zmq_socket_ctx, BackgroundProcHandle, + MPBackgroundProcess) from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -330,31 +330,16 @@ def process_output_socket(self, output_path: str): encoder.encode_into(outputs, buffer) socket.send_multipart((buffer, ), copy=False) -class MPEngineCoreClient: - """ - MPEngineCoreClient: client for multi-proc EngineCore. - EngineCore runs in a background process busy loop, getting - new EngineRequests and returning EngineCoreOutputs - * pushes EngineRequests via input_socket - * pulls EngineCoreOutputs via output_socket - """ +class MPEngineCoreClient(MPBackgroundProcess): + """MPEngineCoreClient: client for multi-proc EngineCore.""" def __init__(self, *args, input_path: str, output_path: str, **kwargs): - # Start EngineCore in background process. - self.proc_handle: Optional[BackgroundProcHandle] - self.proc_handle = EngineCoreProc.make_engine_core_process( + super().__init__( *args, + fn=EngineCoreProc.make_engine_core_process, input_path=input_path, output_path=output_path, - **kwargs, ) - self._finalizer = weakref.finalize(self, self.shutdown) - def shutdown(self): - if hasattr(self, "proc_handle") and self.proc_handle: - self.proc_handle.shutdown() - self.proc_handle = None - - def __del__(self): - self.shutdown() + diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 9bd26ded88b53..caef661320cb2 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -173,14 +173,16 @@ def add_request( priority: int = 0, ) -> None: - # 1) Process raw inputs into the request. + # Process raw inputs into the request. 
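+ # (The Processor runs in this process, so no IPC is involved at this step.)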
engine_request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 2) Add to Detokenizer and EngineCore. + # Add to Detokenizer and EngineCore. if self.multiprocess_mode: # Send to Detokenizer (which forwards to EngineCore). + # Note: we forward the message rather than sending + # to each process separately to avoid race conditions. self.detokenizer_client.input_socket.send_pyobj(engine_request) else: # Add directly to Detokenizer and EngineCore. @@ -192,16 +194,13 @@ def step(self) -> List[RequestOutput]: if self.multiprocess_mode: # Get next output from the Detokenizer. return self.detokenizer_client.output_socket.recv_pyobj() - else: - # 1) Get EngineCoreOutput from the EngineCore. + # Step EngineCore and Detokenizer. engine_core_outputs = self.engine_core.step() - - # 2) Detokenizee the EngineCoreOutput. request_outputs, requests_to_abort = self.detokenizer.step( engine_core_outputs) - # 3) Abort requests that finished due to stopping criteria. + # Abort any requests that hit a stop string. if requests_to_abort: self.abort_request(requests_to_abort) @@ -236,5 +235,8 @@ def __del__(self): self.shutdown() def shutdown(self): - if engine_core := getattr(self, "engine_core", None): - engine_core.shutdown() + if engine_core_client := getattr(self, "engine_core_client", None): + engine_core_client.shutdown() + + if detokenizer_client := getattr(self, "detokenizer_client", None): + detokenizer_client.shutdown() diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index f610c4c90d3dd..bf0712c80c81d 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -4,7 +4,7 @@ from collections.abc import Sequence from contextlib import contextmanager from typing import (Any, Generic, Iterator, List, Optional, TypeVar, Union, - overload) + overload, Callable) import zmq import zmq.asyncio @@ -126,4 +126,47 @@ def zmq_socket_ctx( finally: ctx.destroy(linger=0) - + +from multiprocessing.process import BaseProcess +from vllm.utils import kill_process_tree +import os +import weakref +from dataclasses import dataclass + +@dataclass +class BackgroundProcHandle: + proc: BaseProcess + input_path: str + output_path: str + + def shutdown(self): + # Shutdown the process if needed. + if self.proc.is_alive(): + self.proc.terminate() + self.proc.join(5) + + if self.proc.is_alive(): + kill_process_tree(self.proc.pid) + + # Remove zmq ipc socket files + ipc_sockets = [self.output_path, self.input_path] + for ipc_socket in ipc_sockets: + socket_file = ipc_socket.replace("ipc://", "") + if os and os.path.exists(socket_file): + os.remove(socket_file) + + +class MPBackgroundProcess: + def __init__(self, *args, fn: Callable, input_path: str, output_path: str, **kwargs): + # Start EngineCore in background process. 
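+ # (`fn` is expected to return a BackgroundProcHandle for the started process.)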
+ self.proc_handle: Optional[BackgroundProcHandle] + self.proc_handle = fn(*args, input_path, output_path, kwargs) + self._finalizer = weakref.finalize(self, self.shutdown) + + def __del__(self): + self.shutdown() + + def shutdown(self): + if hasattr(self, "proc_handle") and self.proc_handle: + self.proc_handle.shutdown() + self.proc_handle = None \ No newline at end of file From 395742e92c7830970f914cc8f143b1f0e2061d7c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 20:07:19 +0000 Subject: [PATCH 110/132] again --- vllm/v1/engine/core.py | 3 +-- vllm/v1/utils.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 01b0f3a92a948..f1cde6c3085e9 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -20,8 +20,7 @@ from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineAbortRequest, EngineRequest, - EngineProfileRequest, EngineRequestUnion, - BackgroundProcHandle) + EngineProfileRequest, EngineRequestUnion) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index bf0712c80c81d..53a4c2dc949db 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -157,6 +157,7 @@ def shutdown(self): class MPBackgroundProcess: + def __init__(self, *args, fn: Callable, input_path: str, output_path: str, **kwargs): # Start EngineCore in background process. self.proc_handle: Optional[BackgroundProcHandle] From 2d6ceb87de498370b19f1b8cc682070d3235e17d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 20:57:08 +0000 Subject: [PATCH 111/132] updated --- vllm/v1/engine/async_llm.py | 43 +++++++++----- vllm/v1/engine/core.py | 73 ++++++++--------------- vllm/v1/engine/detokenizer.py | 107 +++++++++------------------------- vllm/v1/utils.py | 58 +++++++++++++----- 4 files changed, 125 insertions(+), 156 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a585b9bcac80d..85f4b29021be6 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -16,6 +16,9 @@ # Inspired by https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/managers/tokenizer_manager.py import asyncio +import zmq +import zmq.asyncio + from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -39,6 +42,7 @@ from vllm.v1.engine.detokenizer import MPDetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor +from vllm.v1.utils import zmq_socket_ctx, make_zmq_socket logger = init_logger(__name__) @@ -87,9 +91,18 @@ def __init__( # IPC paths. from_engine_core_path = get_open_zmq_ipc_path() to_engine_core_path = get_open_zmq_ipc_path() + self.to_detokenizer_path = get_open_zmq_ipc_path() + self.from_detokenizer_path = get_open_zmq_ipc_path() + + # Detokenizer IPC. + self.ctx = zmq.asyncio.Context(io_threads=2) + self.to_detokenizer = make_zmq_socket( + self.ctx, self.to_detokenizer_path, zmq.PULL) # Detokenizer (background process). 
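+ # Receives new requests from this process and EngineCoreOutputs from EngineCore over ZMQ.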
self.detokenizer_client = MPDetokenizerClient( + output_path=self.from_detokenizer_path, + input_path=self.to_detokenizer_path, from_engine_core_path=from_engine_core_path, to_engine_core_path=to_engine_core_path, tokenizer_name=vllm_config.model_config.tokenizer, @@ -145,6 +158,9 @@ def from_engine_args( def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" + if ctx := getattr(self, "ctx", None): + ctx.destroy(linger=0) + if output_handler := getattr(self, "output_hander", None): output_handler.cancel() @@ -192,7 +208,7 @@ async def add_request( # 3) Send to Detokenizer (which forwards to EngineCore). # Note: we forward the request rather than sending to each # process separately to avoid race conditions in Detokenizer. - await self.detokenizer_client.input_socket.send_pyobj(engine_request) + await self.to_detokenizer.send_pyobj(engine_request) return self.rid_to_queue[request_id] @@ -270,18 +286,19 @@ async def generate( async def output_handler_loop(self): """Background loop: pulls from Detokenizer and push to Queues.""" - while True: - # Note: use socket directly to avoid calling await multiple - # times, which causes too much task switching at high QPS. - outputs: List[RequestOutput] = [] - outputs = await self.detokenizer_client.output_socket.recv_pyobj() - - for out in outputs: - # Note: it is possible that a request was aborted - # due to client cancellation while EngineCoreOutputs - # are still flowing, so we just ignore. - if out.request_id in self.rid_to_queue: - self.rid_to_queue[out.request_id].put_nowait(out) + with zmq_socket_ctx(self.from_detokenizer_path, zmq.PULL) as socket: + while True: + # Note: use socket directly to avoid calling await multiple + # times, which causes too much task switching at high QPS. + outputs: List[RequestOutput] = [] + outputs = await socket.recv_pyobj() + + for out in outputs: + # Note: it is possible that a request was aborted + # due to client cancellation while EngineCoreOutputs + # are still flowing, so we just ignore. + if out.request_id in self.rid_to_queue: + self.rid_to_queue[out.request_id].put_nowait(out) async def abort(self, request_id: str): diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f1cde6c3085e9..9781ceaf04e56 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -24,8 +24,7 @@ from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus -from vllm.v1.utils import (zmq_socket_ctx, BackgroundProcHandle, - MPBackgroundProcess) +from vllm.v1.utils import zmq_socket_ctx, MPBackgroundProcess from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -136,6 +135,8 @@ def profile(self, is_start: bool = True): class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" + READY_STR = "READY" + def __init__( self, vllm_config: VllmConfig, @@ -148,10 +149,9 @@ def __init__( super().__init__(vllm_config, executor_class, usage_context) # Background Threads and Queues for IO. These enable us to - # overlap ZMQ socket IO with GPU since they release the GIL, - # and to overlap some serialization/deserialization with the - # model forward pass. - # Threads handle Socket <-> Queues and core_busy_loop uses Queue. + # overlap ZMQ IO with GPU since they release the GIL and + # some serialization/deserialization with the model forward. + # Threads handle Socket <-> Queues and busy_loop uses Queues. 
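+ # (See process_input_socket / process_output_socket below.)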
self.input_queue: queue.Queue[EngineRequestUnion] = queue.Queue() self.output_queue: queue.Queue[List[EngineCoreOutput]] = queue.Queue() threading.Thread(target=self.process_input_socket, @@ -162,42 +162,8 @@ def __init__( daemon=True).start() # Send Readiness signal to EngineClient. - ready_pipe.send({"status": "READY"}) - - @staticmethod - def make_engine_core_process( - vllm_config: VllmConfig, - executor_class: Type[Executor], - usage_context: UsageContext, - input_path: str, - output_path: str, - ) -> BackgroundProcHandle: - context = get_mp_context() - reader, writer = context.Pipe(duplex=False) - - process_kwargs = { - "input_path": input_path, - "output_path": output_path, - "ready_pipe": writer, - "vllm_config": vllm_config, - "executor_class": executor_class, - "usage_context": usage_context, - } - - # Run EngineCore busy loop in background process. - proc = context.Process(target=EngineCoreProc.run_engine_core, - kwargs=process_kwargs) - proc.start() - - # Wait for startup. - if reader.recv()["status"] != "READY": - raise RuntimeError( - "EngineCore initalization failed. See root cause above." - ) + ready_pipe.send({"status": EngineCoreProc.READY_STR}) - return BackgroundProcHandle(proc=proc, - input_path=input_path, - output_path=output_path) @staticmethod def run_engine_core(*args, **kwargs): @@ -331,14 +297,25 @@ def process_output_socket(self, output_path: str): class MPEngineCoreClient(MPBackgroundProcess): - """MPEngineCoreClient: client for multi-proc EngineCore.""" + """Client for multi-proc EngineCore.""" - def __init__(self, *args, input_path: str, output_path: str, **kwargs): - super().__init__( - *args, - fn=EngineCoreProc.make_engine_core_process, + def __init__(self, + input_path: str, + output_path: str, + vllm_config: VllmConfig, + executor_class: Type[Executor], + usage_context: UsageContext): + + super().__init__() + + self.proc_handle = MPBackgroundProcess.wait_for_startup( input_path=input_path, output_path=output_path, + process_name="EngineCore", + target_fn=EngineCoreProc.run_engine_core, + process_kwargs={ + "vllm_config": vllm_config, + "executor_class": executor_class, + "usage_context": usage_context, + }, ) - - diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index be9857f1f6aa0..455fb442f6c51 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -4,10 +4,10 @@ import msgspec import signal from dataclasses import dataclass +from multiprocessing.connection import Connection from typing import Dict, Iterable, List, Optional, Tuple,Union from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.executor.multiproc_worker_utils import get_mp_context from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import RequestOutputKind @@ -17,10 +17,8 @@ from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, get_exception_traceback) from vllm.v1.engine import (EngineCoreOutputs, - BackgroundProcHandle, EngineRequest, EngineAbortRequest) -from vllm.v1.utils import (make_zmq_socket, zmq_socket_ctx, - wait_for_startup) +from vllm.v1.utils import zmq_socket_ctx, MPBackgroundProcess logger = init_logger(__name__) @@ -298,8 +296,8 @@ def __init__( to_engine_core_path: str, input_path: str, output_path: str, - write_: str, - **kwargs + ready_pipe: Connection, + **kwargs, ): super().__init__(*args, **kwargs) @@ -308,50 +306,9 @@ def __init__( self.input_path = input_path self.output_path = output_path - # Send readiness signal. 
- with zmq_socket_ctx(ready_path, zmq.PUSH) as ready_socket: - ready_socket.send_string(DetokenizerProc.READY_STR) + # Send Readiness signal to DetokenizerClient. + ready_pipe.send({"status": DetokenizerProc.READY_STR}) - - @staticmethod - def make_detokenizer_process( - from_engine_core_path: str, - to_engine_core_path: str, - input_path: str, - output_path: str, - tokenizer_name: str, - tokenizer_mode: str = "auto", - trust_remote_code: bool = False, - revision: Optional[str] = None, - ) -> BackgroundProcHandle: - context = get_mp_context() - reader, writer = context.Pipe(duplex=False) - - process_kwargs = { - "from_engine_core_path": from_engine_core_path, - "to_engine_core_path": to_engine_core_path, - "input_path": input_path, - "output_path": output_path, - "ready_pipe": writer, - "tokenizer_name": tokenizer_name, - "tokenizer_mode": tokenizer_mode, - "trust_remote_code": trust_remote_code, - "revision": revision, - } - # Run Detokenizer busy loop in background process. - proc = context.Process(target=DetokenizerProc.run_detokenizer, - kwargs=process_kwargs) - proc.start() - - # Wait for startup. - if reader.recv()["status"] != "READY": - raise RuntimeError( - "Detokenizer initalization failed. See root cause above." - ) - - return BackgroundProcHandle(proc=proc, - input_path=input_path, - output_path=output_path) @staticmethod def run_detokenizer(*args, **kwargs): @@ -475,44 +432,32 @@ def run_busy_loop(self): decoder=decoder, ) -class MPDetokenizerClient: +class MPDetokenizerClient(MPBackgroundProcess): + """Client for multi-proc Detokenizer.""" def __init__(self, - *args, + input_path: str, + output_path: str, from_engine_core_path: str, to_engine_core_path: str, - **kwargs): - - # ZMQ setup. - self.ctx = zmq.asyncio.Context(2) - - # Get input (DetokenizerRequest) to Detokenizer. - input_path = get_open_zmq_ipc_path() - self.input_socket = make_zmq_socket( - self.ctx, - input_path, - zmq.PUSH, - ) + tokenizer_name: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + revision: Optional[str] = None): - # Get output (RequestOutput) from Detokenizer. - output_path = get_open_zmq_ipc_path() - self.output_socket = make_zmq_socket(self.ctx, - output_path, - zmq.PULL, - ) + super().__init__() - # Start Detokenizer in background process. 
- self.proc_handle: Optional[BackgroundProcHandle] - self.proc_handle = DetokenizerProc.make_detokenizer_process( - *args, - from_engine_core_path=from_engine_core_path, - to_engine_core_path=to_engine_core_path, + self.proc_handle = MPBackgroundProcess.wait_for_startup( input_path=input_path, output_path=output_path, - **kwargs, + process_name="Detokenizer", + target_fn=DetokenizerProc.run_detokenizer, + process_kwargs={ + "from_engine_core_path": from_engine_core_path, + "to_engine_core_path": to_engine_core_path, + "tokenizer_name": tokenizer_name, + "tokenizer_mode": tokenizer_mode, + "trust_remote_code": trust_remote_code, + "revision": revision, + }, ) - - def shutdown(self): - if hasattr(self, "proc_handle") and self.proc_handle: - self.proc_handle.shutdown() - self.proc_handle = None diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 53a4c2dc949db..d52c343966353 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,15 +1,18 @@ +import os +import weakref +from dataclasses import dataclass from multiprocessing.process import BaseProcess -from multiprocessing.connection import Connection - from collections.abc import Sequence from contextlib import contextmanager -from typing import (Any, Generic, Iterator, List, Optional, TypeVar, Union, - overload, Callable) +from typing import (Any, Generic, Dict, Iterator, List, Optional, TypeVar, + Union, Callable, overload) import zmq import zmq.asyncio from vllm.logger import init_logger +from vllm.utils import kill_process_tree +from vllm.executor.multiproc_worker_utils import get_mp_context logger = init_logger(__name__) @@ -127,11 +130,6 @@ def zmq_socket_ctx( finally: ctx.destroy(linger=0) -from multiprocessing.process import BaseProcess -from vllm.utils import kill_process_tree -import os -import weakref -from dataclasses import dataclass @dataclass class BackgroundProcHandle: @@ -157,11 +155,11 @@ def shutdown(self): class MPBackgroundProcess: - - def __init__(self, *args, fn: Callable, input_path: str, output_path: str, **kwargs): - # Start EngineCore in background process. + + READY_STR = "READY" + + def __init__(self): self.proc_handle: Optional[BackgroundProcHandle] - self.proc_handle = fn(*args, input_path, output_path, kwargs) self._finalizer = weakref.finalize(self, self.shutdown) def __del__(self): @@ -170,4 +168,36 @@ def __del__(self): def shutdown(self): if hasattr(self, "proc_handle") and self.proc_handle: self.proc_handle.shutdown() - self.proc_handle = None \ No newline at end of file + self.proc_handle = None + + @staticmethod + def wait_for_startup( + input_path: str, + output_path: str, + process_name: str, + target_fn: Callable, + process_kwargs: Dict[Any, Any], + ) -> "MPBackgroundProcess": + context = get_mp_context() + reader, writer = context.Pipe(duplex=False) + + assert ("ready_pipe" not in process_kwargs and + "input_path" not in process_kwargs and + "output_path" not in process_kwargs) + process_kwargs["ready_pipe"] = writer + process_kwargs["input_path"] = input_path + process_kwargs["output_path"] = output_path + + # Run Detokenizer busy loop in background process. + proc = context.Process(target=target_fn, + kwargs=process_kwargs) + proc.start() + + # Wait for startup. + if reader.recv()["status"] != "READY": + raise RuntimeError( + f"{process_name} initalization failed. " + "See root cause above." 
+ ) + + return BackgroundProcHandle(proc, input_path, output_path) From a19cb83c93ea7a53056f4a06fa2e87e988b5a381 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 20:58:39 +0000 Subject: [PATCH 112/132] cleanup --- vllm/entrypoints/openai/api_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index ed3634a50e7e2..922f036a7f6b3 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -671,11 +671,11 @@ async def run_server(args, **uvicorn_kwargs) -> None: # workaround to ensure user has enough fds available for uvicorn + ipc set_ulimit() - def sigterm_handler(*_) -> None: + def signal_handler(*_) -> None: # Interrupt server on sigterm while initializing raise KeyboardInterrupt("terminated") - signal.signal(signal.SIGTERM, sigterm_handler) + signal.signal(signal.SIGTERM, signal_handler) # The child processes will send SIGQUIT to this process when # any error happens. This process then clean up the whole tree. From 1695fddf712de610d020d3aa322bccaeff2dd2e5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 20:58:56 +0000 Subject: [PATCH 113/132] cleaning --- vllm/entrypoints/openai/api_server.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 922f036a7f6b3..76d9a2bd714cd 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -684,7 +684,6 @@ def sigquit_handler(signum, frame): signal.signal(signal.SIGQUIT, sigquit_handler) - async with build_async_engine_client(args) as engine_client: app = build_app(args) From b2f845b1f60e241b58e1773f8d54d40dca29032d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 22:10:10 +0000 Subject: [PATCH 114/132] updated --- vllm/v1/engine/__init__.py | 10 +++++ vllm/v1/engine/async_llm.py | 67 +++++++++++++++++------------- vllm/v1/engine/core.py | 10 +++-- vllm/v1/engine/detokenizer.py | 76 +++++++++++++++-------------------- vllm/v1/utils.py | 8 ++-- 5 files changed, 91 insertions(+), 80 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 3af0219db1c14..a99f8a617fd8f 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,3 +1,4 @@ +import enum from dataclasses import dataclass from typing import List, Optional, Union @@ -60,3 +61,12 @@ class EngineCoreOutputs( # [num_reqs] outputs: List[EngineCoreOutput] + + +class EngineRequestType(enum.Enum): + """ + Request types defined as hex byte strings, so it can be sent over sockets + without separate encoding step. 
+ """ + FROM_ENGINE_CORE = b'\x00' + FROM_ENGINE = b'\x01' diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 85f4b29021be6..acf5e028475cd 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -18,8 +18,9 @@ import asyncio import zmq import zmq.asyncio +import pickle -from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union +from typing import Any, AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs @@ -37,12 +38,12 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils import get_open_zmq_ipc_path -from vllm.v1.engine import EngineAbortRequest +from vllm.v1.engine import EngineAbortRequest, EngineRequestType from vllm.v1.engine.core import MPEngineCoreClient from vllm.v1.engine.detokenizer import MPDetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.utils import zmq_socket_ctx, make_zmq_socket +from vllm.v1.utils import make_zmq_socket logger = init_logger(__name__) @@ -89,21 +90,22 @@ def __init__( ) # IPC paths. - from_engine_core_path = get_open_zmq_ipc_path() + to_detokenizer_path = get_open_zmq_ipc_path() to_engine_core_path = get_open_zmq_ipc_path() - self.to_detokenizer_path = get_open_zmq_ipc_path() - self.from_detokenizer_path = get_open_zmq_ipc_path() + to_llm_engine_path = get_open_zmq_ipc_path() + # Detokenizer IPC. self.ctx = zmq.asyncio.Context(io_threads=2) + self.from_detokenizer = make_zmq_socket( + self.ctx, to_llm_engine_path, zmq.PULL) self.to_detokenizer = make_zmq_socket( - self.ctx, self.to_detokenizer_path, zmq.PULL) - + self.ctx, to_detokenizer_path, zmq.PUSH) + # Detokenizer (background process). self.detokenizer_client = MPDetokenizerClient( - output_path=self.from_detokenizer_path, - input_path=self.to_detokenizer_path, - from_engine_core_path=from_engine_core_path, + output_path=to_llm_engine_path, + input_path=to_detokenizer_path, to_engine_core_path=to_engine_core_path, tokenizer_name=vllm_config.model_config.tokenizer, tokenizer_mode=vllm_config.model_config.tokenizer_mode, @@ -114,7 +116,7 @@ def __init__( # EngineCore (background process). self.engine_core_client = MPEngineCoreClient( input_path=to_engine_core_path, - output_path=from_engine_core_path, + output_path=to_detokenizer_path, vllm_config=vllm_config, executor_class=executor_class, usage_context=usage_context, @@ -207,8 +209,8 @@ async def add_request( # 3) Send to Detokenizer (which forwards to EngineCore). # Note: we forward the request rather than sending to each - # process separately to avoid race conditions in Detokenizer. - await self.to_detokenizer.send_pyobj(engine_request) + # process separately to avoid race conditions in Detokenizer). + await self.send_to_detokenizer(engine_request) return self.rid_to_queue[request_id] @@ -286,27 +288,28 @@ async def generate( async def output_handler_loop(self): """Background loop: pulls from Detokenizer and push to Queues.""" - with zmq_socket_ctx(self.from_detokenizer_path, zmq.PULL) as socket: - while True: - # Note: use socket directly to avoid calling await multiple - # times, which causes too much task switching at high QPS. 
- outputs: List[RequestOutput] = [] - outputs = await socket.recv_pyobj() - - for out in outputs: - # Note: it is possible that a request was aborted - # due to client cancellation while EngineCoreOutputs - # are still flowing, so we just ignore. - if out.request_id in self.rid_to_queue: - self.rid_to_queue[out.request_id].put_nowait(out) + epoch = 0 + while True: + logger.info(f"EPOCH: {epoch}") + epoch+=1 + # Note: use socket directly to avoid calling await multiple + # times, which causes too much task switching at high QPS. + outputs: List[RequestOutput] = [] + outputs = await self.from_detokenizer.recv_pyobj() + + for out in outputs: + # Note: it is possible that a request was aborted + # due to client cancellation while EngineCoreOutputs + # are still flowing, so we just ignore. + if out.request_id in self.rid_to_queue: + self.rid_to_queue[out.request_id].put_nowait(out) async def abort(self, request_id: str): """Abort request if the client cancels the request.""" # Send abort to Detokenizer (which will fwd to EngineCore). - await self.detokenizer_client.input_socket.send_pyobj( - EngineAbortRequest([request_id])) + await self.send_to_detokenizer(EngineAbortRequest([request_id])) # Remove from request output queues. if request_id in self.rid_to_queue: @@ -314,6 +317,12 @@ async def abort(self, request_id: str): if self.log_requests: logger.info("Aborted %s.", request_id) + + async def send_to_detokenizer(self, object: Any): + """Send object to Detokenizer with a FROM_ENGINE flag.""" + + msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(object)) + await self.to_detokenizer.send_multipart(msg, copy=False) def encode( self, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 9781ceaf04e56..151cafbe62ae1 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -20,7 +20,8 @@ from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineAbortRequest, EngineRequest, - EngineProfileRequest, EngineRequestUnion) + EngineRequestType, EngineProfileRequest, + EngineRequestUnion) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus @@ -162,7 +163,7 @@ def __init__( daemon=True).start() # Send Readiness signal to EngineClient. 
- ready_pipe.send({"status": EngineCoreProc.READY_STR}) + ready_pipe.send({"status": "READY"}) @staticmethod @@ -226,6 +227,7 @@ def run_busy_loop(self): self._handle_client_request(req) break except queue.Empty: + logger.info(f"EPOCH: {epoch}") self._log_stats() logger.debug("EngineCore busy loop waiting.") except BaseException: @@ -293,7 +295,8 @@ def process_output_socket(self, output_path: str): engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) encoder.encode_into(outputs, buffer) - socket.send_multipart((buffer, ), copy=False) + msg = (EngineRequestType.FROM_ENGINE_CORE.value, buffer) + socket.send_multipart(msg, copy=False) class MPEngineCoreClient(MPBackgroundProcess): @@ -319,3 +322,4 @@ def __init__(self, "usage_context": usage_context, }, ) + print("STARTED ENGINE CORE") diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 455fb442f6c51..0511b8ccdfa12 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -14,11 +14,10 @@ from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, - get_exception_traceback) -from vllm.v1.engine import (EngineCoreOutputs, +from vllm.utils import get_exception_traceback +from vllm.v1.engine import (EngineCoreOutputs, EngineRequestType, EngineRequest, EngineAbortRequest) -from vllm.v1.utils import zmq_socket_ctx, MPBackgroundProcess +from vllm.v1.utils import make_zmq_socket, MPBackgroundProcess logger = init_logger(__name__) @@ -287,27 +286,23 @@ def step( class DetokenizerProc(Detokenizer): """ZMQ-wrapper for running Detokenizer in background process.""" - READY_STR = "READY" - def __init__( self, *args, - from_engine_core_path: str, - to_engine_core_path: str, input_path: str, output_path: str, + to_engine_core_path: str, ready_pipe: Connection, **kwargs, ): super().__init__(*args, **kwargs) - self.from_engine_core_path = from_engine_core_path - self.to_engine_core_path = to_engine_core_path self.input_path = input_path self.output_path = output_path + self.to_engine_core_path = to_engine_core_path # Send Readiness signal to DetokenizerClient. - ready_pipe.send({"status": DetokenizerProc.READY_STR}) + ready_pipe.send({"status": "READY"}) @staticmethod @@ -350,13 +345,12 @@ def signal_handler(signum, frame): def _handle_from_llm_engine( self, - from_llm_engine: zmq.Socket, + request_bytes: bytes, to_engine_core: zmq.Socket, ) -> None: """Handle EngineRequest from the LLMEngine.""" - pickled_req = from_llm_engine.recv() - req = pickle.loads(pickled_req) + req = pickle.loads(request_bytes) if isinstance(req, EngineRequest): self.add_request(req) @@ -366,11 +360,11 @@ def _handle_from_llm_engine( raise ValueError(f"Unknown type: {req}") # Forward to EngineCore. - to_engine_core.send(pickled_req) + to_engine_core.send(request_bytes) def _handle_from_engine_core( self, - from_engine_core: zmq.Socket, + output_bytes: bytes, to_engine_core: zmq.Socket, to_llm_engine: zmq.Socket, decoder: msgspec.msgpack.Decoder, @@ -378,8 +372,7 @@ def _handle_from_engine_core( """Handle Outputs from the EngineCore.""" # Deserialize the EngineOutput (use msgpack for performance). - (frame, ) = from_engine_core.recv_multipart(copy=False) - outputs: EngineCoreOutputs = decoder.decode(frame.buffer) + outputs: EngineCoreOutputs = decoder.decode(output_bytes) # Detokenize. 
request_outputs, requests_to_abort = self.step(outputs) @@ -398,39 +391,37 @@ def run_busy_loop(self): decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) - with (zmq_socket_ctx(self.from_engine_core_path, zmq.PULL) as from_engine_core, - zmq_socket_ctx(self.to_engine_core_path, zmq.PUSH) as to_engine_core, - zmq_socket_ctx(self.input_path, zmq.PULL) as from_llm_engine, - zmq_socket_ctx(self.output_path, zmq.PUSH) as to_llm_engine): - - # TODO(rob): avoid poll by having both EngineCore and - # LLMEngine send to the same socket. - poller = zmq.Poller() - poller.register(from_engine_core, zmq.POLLIN) - poller.register(from_llm_engine, zmq.POLLIN) - + ctx = zmq.Context(io_threads=2) + try: + input_socket = make_zmq_socket(ctx, self.input_path, zmq.PULL) + to_llm_engine = make_zmq_socket(ctx, self.output_path, zmq.PUSH) + to_engine_core = make_zmq_socket(ctx, self.to_engine_core_path, zmq.PUSH) epoch = 0 while True: - logger.info(f"EPOCH: {epoch}") + (msg_type, msg_bytes) = input_socket.recv_multipart() - socks = dict(poller.poll()) + # Handle message from LLMEngine (Abort or New Request). + if msg_type == EngineRequestType.FROM_ENGINE.value: + self._handle_from_llm_engine(msg_bytes, to_engine_core) - # Handle input from LLMEngine. - if from_llm_engine in socks: - self._handle_from_llm_engine( - from_llm_engine=from_llm_engine, - to_engine_core=to_engine_core, - ) - - # Handle output from EngineCoreOutput. - if from_engine_core in socks: + # Handle message from EngineCore (EngineCoreOutputs). + elif msg_type == EngineRequestType.FROM_ENGINE_CORE.value: epoch += 1 self._handle_from_engine_core( - from_engine_core=from_engine_core, + output_bytes=msg_bytes, to_engine_core=to_engine_core, to_llm_engine=to_llm_engine, decoder=decoder, ) + else: + raise ValueError(f"Unknown Message Type: {msg_type}") + + except KeyboardInterrupt: + logger.debug("Got Keyboard Interrupt.") + + finally: + ctx.destroy(linger=0) + class MPDetokenizerClient(MPBackgroundProcess): """Client for multi-proc Detokenizer.""" @@ -438,7 +429,6 @@ class MPDetokenizerClient(MPBackgroundProcess): def __init__(self, input_path: str, output_path: str, - from_engine_core_path: str, to_engine_core_path: str, tokenizer_name: str, tokenizer_mode: str = "auto", @@ -453,7 +443,6 @@ def __init__(self, process_name="Detokenizer", target_fn=DetokenizerProc.run_detokenizer, process_kwargs={ - "from_engine_core_path": from_engine_core_path, "to_engine_core_path": to_engine_core_path, "tokenizer_name": tokenizer_name, "tokenizer_mode": tokenizer_mode, @@ -461,3 +450,4 @@ def __init__(self, "revision": revision, }, ) + print("STARTED DETOKENIZER") diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index d52c343966353..508474ea53f57 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -103,11 +103,11 @@ def make_zmq_socket(ctx: Union[zmq.asyncio.Context, zmq.Context], path: str, if type == zmq.PULL: socket.setsockopt(zmq.RCVHWM, 0) socket.setsockopt(zmq.RCVBUF, buf_size) - socket.connect(path) + socket.bind(path) elif type == zmq.PUSH: socket.setsockopt(zmq.SNDHWM, 0) socket.setsockopt(zmq.SNDBUF, buf_size) - socket.bind(path) + socket.connect(path) else: raise ValueError(f"Unknown Socket Type: {type}") @@ -125,7 +125,7 @@ def zmq_socket_ctx( yield make_zmq_socket(ctx, path, type) except KeyboardInterrupt: - logger.debug("Worker had Keyboard Interrupt.") + logger.debug("Got Keyboard Interrupt.") finally: ctx.destroy(linger=0) @@ -156,8 +156,6 @@ def shutdown(self): class MPBackgroundProcess: - READY_STR = "READY" - def __init__(self): 
self.proc_handle: Optional[BackgroundProcHandle] self._finalizer = weakref.finalize(self, self.shutdown) From 12df407bd991e6255bd1bea37d82ead4e7ad248a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 22:13:15 +0000 Subject: [PATCH 115/132] remove epoch --- vllm/v1/engine/async_llm.py | 3 --- vllm/v1/engine/core.py | 7 ------- vllm/v1/engine/detokenizer.py | 2 -- 3 files changed, 12 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index acf5e028475cd..4e791e8f06565 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -288,10 +288,7 @@ async def generate( async def output_handler_loop(self): """Background loop: pulls from Detokenizer and push to Queues.""" - epoch = 0 while True: - logger.info(f"EPOCH: {epoch}") - epoch+=1 # Note: use socket directly to avoid calling await multiple # times, which causes too much task switching at high QPS. outputs: List[RequestOutput] = [] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 151cafbe62ae1..e4339222f7539 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -212,13 +212,7 @@ def run_busy_loop(self): """Core busy loop of the EngineCore.""" # Loop until process is sent a SIGINT or SIGTERM - epoch = 0 while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 - # if epoch == 10: - # raise ValueError("Died") - # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): while True: @@ -227,7 +221,6 @@ def run_busy_loop(self): self._handle_client_request(req) break except queue.Empty: - logger.info(f"EPOCH: {epoch}") self._log_stats() logger.debug("EngineCore busy loop waiting.") except BaseException: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 0511b8ccdfa12..870c4a7501a36 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -396,7 +396,6 @@ def run_busy_loop(self): input_socket = make_zmq_socket(ctx, self.input_path, zmq.PULL) to_llm_engine = make_zmq_socket(ctx, self.output_path, zmq.PUSH) to_engine_core = make_zmq_socket(ctx, self.to_engine_core_path, zmq.PUSH) - epoch = 0 while True: (msg_type, msg_bytes) = input_socket.recv_multipart() @@ -406,7 +405,6 @@ def run_busy_loop(self): # Handle message from EngineCore (EngineCoreOutputs). elif msg_type == EngineRequestType.FROM_ENGINE_CORE.value: - epoch += 1 self._handle_from_engine_core( output_bytes=msg_bytes, to_engine_core=to_engine_core, From b7843c93060c7684c37f6117261ab7a1e0df6d05 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 22:17:46 +0000 Subject: [PATCH 116/132] update --- vllm/v1/engine/__init__.py | 6 +++--- vllm/v1/engine/async_llm.py | 22 ++++++++++------------ vllm/v1/engine/core.py | 17 ++++++----------- vllm/v1/engine/detokenizer.py | 26 ++++++++++++++------------ vllm/v1/engine/llm_engine.py | 6 +++--- vllm/v1/utils.py | 19 ++++++++----------- 6 files changed, 44 insertions(+), 52 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index a99f8a617fd8f..0e104118c9ff9 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -63,10 +63,10 @@ class EngineCoreOutputs( outputs: List[EngineCoreOutput] -class EngineRequestType(enum.Enum): +class EngineRequestType(enum.Enum): """ Request types defined as hex byte strings, so it can be sent over sockets without separate encoding step. 
- """ + """ FROM_ENGINE_CORE = b'\x00' - FROM_ENGINE = b'\x01' + FROM_ENGINE = b'\x01' diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 4e791e8f06565..7a682f79e7972 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -20,7 +20,8 @@ import zmq.asyncio import pickle -from typing import Any, AsyncGenerator, Dict, List, Mapping, Optional, Type, Union +from typing import (Any, AsyncGenerator, Dict, List, Mapping, Optional, Type, + Union) from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs @@ -93,15 +94,14 @@ def __init__( to_detokenizer_path = get_open_zmq_ipc_path() to_engine_core_path = get_open_zmq_ipc_path() to_llm_engine_path = get_open_zmq_ipc_path() - # Detokenizer IPC. self.ctx = zmq.asyncio.Context(io_threads=2) - self.from_detokenizer = make_zmq_socket( - self.ctx, to_llm_engine_path, zmq.PULL) - self.to_detokenizer = make_zmq_socket( - self.ctx, to_detokenizer_path, zmq.PUSH) - + self.from_detokenizer = make_zmq_socket(self.ctx, to_llm_engine_path, + zmq.PULL) + self.to_detokenizer = make_zmq_socket(self.ctx, to_detokenizer_path, + zmq.PUSH) + # Detokenizer (background process). self.detokenizer_client = MPDetokenizerClient( output_path=to_llm_engine_path, @@ -162,7 +162,7 @@ def shutdown(self): if ctx := getattr(self, "ctx", None): ctx.destroy(linger=0) - + if output_handler := getattr(self, "output_hander", None): output_handler.cancel() @@ -278,13 +278,12 @@ async def generate( yield out # Client request cancellation is handled through calling - # task.cancel() on generate(). Calling self.abort() forwards the + # task.cancel() on generate(). Calling self.abort() forwards the # cancellation to the EngineCore and Detokenizer. except asyncio.CancelledError: await self.abort(request_id) raise - async def output_handler_loop(self): """Background loop: pulls from Detokenizer and push to Queues.""" @@ -300,7 +299,6 @@ async def output_handler_loop(self): # are still flowing, so we just ignore. if out.request_id in self.rid_to_queue: self.rid_to_queue[out.request_id].put_nowait(out) - async def abort(self, request_id: str): """Abort request if the client cancels the request.""" @@ -314,7 +312,7 @@ async def abort(self, request_id: str): if self.log_requests: logger.info("Aborted %s.", request_id) - + async def send_to_detokenizer(self, object: Any): """Send object to Detokenizer with a FROM_ENGINE flag.""" diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e4339222f7539..a06fef170e8d5 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -11,7 +11,6 @@ from multiprocessing.connection import Connection from vllm.config import CacheConfig, VllmConfig -from vllm.executor.multiproc_worker_utils import get_mp_context from vllm.logger import init_logger from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) @@ -108,8 +107,8 @@ def add_request(self, request: EngineRequest): def abort_requests(self, request_ids: List[str]): """Abort requests from the scheduler.""" - # TODO: The scheduler doesn't really need to know the - # specific finish reason, TBD whether we propagate that + # TODO: The scheduler doesn't really need to know the + # specific finish reason, TBD whether we propagate that # (i.e. client-aborted vs stop criteria met). 
self.scheduler.finish_requests(request_ids, RequestStatus.FINISHED_ABORTED) @@ -150,7 +149,7 @@ def __init__( super().__init__(vllm_config, executor_class, usage_context) # Background Threads and Queues for IO. These enable us to - # overlap ZMQ IO with GPU since they release the GIL and + # overlap ZMQ IO with GPU since they release the GIL and # some serialization/deserialization with the model forward. # Threads handle Socket <-> Queues and busy_loop uses Queues. self.input_queue: queue.Queue[EngineRequestUnion] = queue.Queue() @@ -165,7 +164,6 @@ def __init__( # Send Readiness signal to EngineClient. ready_pipe.send({"status": "READY"}) - @staticmethod def run_engine_core(*args, **kwargs): """Launch EngineCore busy loop in background process.""" @@ -226,7 +224,7 @@ def run_busy_loop(self): except BaseException: raise - # 2) Handle any new inputs. + # 2) Handle any new client requests (Abort or Add). while not self.input_queue.empty(): req = self.input_queue.get_nowait() self._handle_client_request(req) @@ -295,11 +293,8 @@ def process_output_socket(self, output_path: str): class MPEngineCoreClient(MPBackgroundProcess): """Client for multi-proc EngineCore.""" - def __init__(self, - input_path: str, - output_path: str, - vllm_config: VllmConfig, - executor_class: Type[Executor], + def __init__(self, input_path: str, output_path: str, + vllm_config: VllmConfig, executor_class: Type[Executor], usage_context: UsageContext): super().__init__() diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 870c4a7501a36..384e1a69170af 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -5,7 +5,7 @@ import signal from dataclasses import dataclass from multiprocessing.connection import Connection -from typing import Dict, Iterable, List, Optional, Tuple,Union +from typing import Dict, Iterable, List, Optional, Tuple, Union from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger @@ -23,6 +23,7 @@ POLLING_TIMEOUT_MS = 5000 + @dataclass class IncrementalDetokenizer: @@ -90,7 +91,8 @@ def from_new_request( # NOTE(Nick): could we take ownership of it though? token_ids=request.prompt_token_ids.copy(), stop=stops, - include_stop_str_in_output=sampling_params.include_stop_str_in_output, + include_stop_str_in_output=sampling_params. + include_stop_str_in_output, prefix_offset=prefix_offset, read_offset=read_offset, skip_special_tokens=sampling_params.skip_special_tokens, @@ -247,7 +249,8 @@ def add_request( self.request_states[request.request_id] = request_state def step( - self, encore_core_outputs: EngineCoreOutputs, + self, + encore_core_outputs: EngineCoreOutputs, ) -> Tuple[List[RequestOutput], List[str]]: """Update state and make RequestOutputs for the LLMEngine.""" @@ -283,6 +286,7 @@ def step( # Return to EngineClient. return request_outputs, requests_to_abort + class DetokenizerProc(Detokenizer): """ZMQ-wrapper for running Detokenizer in background process.""" @@ -304,7 +308,6 @@ def __init__( # Send Readiness signal to DetokenizerClient. 
ready_pipe.send({"status": "READY"}) - @staticmethod def run_detokenizer(*args, **kwargs): """Launch Detokenizer busy loop in background process.""" @@ -336,7 +339,7 @@ def signal_handler(signum, frame): except Exception: traceback = get_exception_traceback() - logger.error(f"Detokenizer hit an exception: {traceback}") + logger.error("Detokenizer hit an exception: %s", traceback) parent_process.send_signal(signal.SIGQUIT) finally: @@ -344,7 +347,7 @@ def signal_handler(signum, frame): detokenizer = None def _handle_from_llm_engine( - self, + self, request_bytes: bytes, to_engine_core: zmq.Socket, ) -> None: @@ -361,7 +364,7 @@ def _handle_from_llm_engine( # Forward to EngineCore. to_engine_core.send(request_bytes) - + def _handle_from_engine_core( self, output_bytes: bytes, @@ -382,9 +385,7 @@ def _handle_from_engine_core( # Abort requests that finished due to stop strings. if len(requests_to_abort) > 0: - to_engine_core.send_pyobj( - EngineAbortRequest(requests_to_abort)) - + to_engine_core.send_pyobj(EngineAbortRequest(requests_to_abort)) def run_busy_loop(self): """Core busy loop of the Detokenizer.""" @@ -395,7 +396,8 @@ def run_busy_loop(self): try: input_socket = make_zmq_socket(ctx, self.input_path, zmq.PULL) to_llm_engine = make_zmq_socket(ctx, self.output_path, zmq.PUSH) - to_engine_core = make_zmq_socket(ctx, self.to_engine_core_path, zmq.PUSH) + to_engine_core = make_zmq_socket(ctx, self.to_engine_core_path, + zmq.PUSH) while True: (msg_type, msg_bytes) = input_socket.recv_multipart() @@ -423,7 +425,7 @@ def run_busy_loop(self): class MPDetokenizerClient(MPBackgroundProcess): """Client for multi-proc Detokenizer.""" - + def __init__(self, input_path: str, output_path: str, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index caef661320cb2..be660a4023b30 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -86,7 +86,7 @@ def __init__( executor_class=executor_class, usage_context=usage_context, ) - + else: # Detokenizer (in process). self.detokenizer = Detokenizer( @@ -190,7 +190,7 @@ def add_request( self.engine_core.add_request(engine_request) def step(self) -> List[RequestOutput]: - + if self.multiprocess_mode: # Get next output from the Detokenizer. return self.detokenizer_client.output_socket.recv_pyobj() @@ -203,7 +203,7 @@ def step(self) -> List[RequestOutput]: # Abort any requests that hit a stop string. if requests_to_abort: self.abort_request(requests_to_abort) - + return request_outputs def get_model_config(self): diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 508474ea53f57..fde4601361256 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -4,7 +4,7 @@ from multiprocessing.process import BaseProcess from collections.abc import Sequence from contextlib import contextmanager -from typing import (Any, Generic, Dict, Iterator, List, Optional, TypeVar, +from typing import (Any, Generic, Dict, Iterator, List, Optional, TypeVar, Union, Callable, overload) import zmq @@ -179,23 +179,20 @@ def wait_for_startup( context = get_mp_context() reader, writer = context.Pipe(duplex=False) - assert ("ready_pipe" not in process_kwargs and - "input_path" not in process_kwargs and - "output_path" not in process_kwargs) + assert ("ready_pipe" not in process_kwargs + and "input_path" not in process_kwargs + and "output_path" not in process_kwargs) process_kwargs["ready_pipe"] = writer process_kwargs["input_path"] = input_path process_kwargs["output_path"] = output_path # Run Detokenizer busy loop in background process. 
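# [Editor's note] A self-contained sketch of the readiness handshake around
# the Process(...) call just below: the parent creates a one-way Pipe, hands
# the write end to the child as `ready_pipe`, and blocks on recv() until the
# child has finished initializing. Only the {"status": "READY"} convention is
# taken from the patch; the function and path names here are illustrative.
from multiprocessing import get_context


def background_busy_loop(*, ready_pipe, input_path, output_path):
    # ... heavyweight init (tokenizer, sockets, etc.) would happen here ...
    ready_pipe.send({"status": "READY"})
    # ... a real implementation would now enter its busy loop ...


def start_background_process():
    ctx = get_context("spawn")
    reader, writer = ctx.Pipe(duplex=False)
    proc = ctx.Process(target=background_busy_loop,
                       kwargs={
                           "ready_pipe": writer,
                           "input_path": "ipc:///tmp/example_in",
                           "output_path": "ipc:///tmp/example_out",
                       })
    proc.start()
    # Block until the child reports it is ready (or fail loudly).
    if reader.recv()["status"] != "READY":
        raise RuntimeError("Background process initialization failed.")
    return proc


if __name__ == "__main__":
    start_background_process().join()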
- proc = context.Process(target=target_fn, - kwargs=process_kwargs) + proc = context.Process(target=target_fn, kwargs=process_kwargs) proc.start() - + # Wait for startup. if reader.recv()["status"] != "READY": - raise RuntimeError( - f"{process_name} initalization failed. " - "See root cause above." - ) + raise RuntimeError(f"{process_name} initalization failed. " + "See root cause above.") return BackgroundProcHandle(proc, input_path, output_path) From a6368a7826579ab492dd8da4b20c78701af82918 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 22:24:00 +0000 Subject: [PATCH 117/132] fix typing --- vllm/v1/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index fde4601361256..1544dd104c434 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -175,7 +175,7 @@ def wait_for_startup( process_name: str, target_fn: Callable, process_kwargs: Dict[Any, Any], - ) -> "MPBackgroundProcess": + ) -> BackgroundProcHandle: context = get_mp_context() reader, writer = context.Pipe(duplex=False) From 315efeadba9ebf407227f40c9d5ef8ee2ae3e271 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 22:25:28 +0000 Subject: [PATCH 118/132] remove prints --- vllm/v1/engine/core.py | 1 - vllm/v1/engine/detokenizer.py | 1 - 2 files changed, 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index a06fef170e8d5..136801ffbd617 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -310,4 +310,3 @@ def __init__(self, input_path: str, output_path: str, "usage_context": usage_context, }, ) - print("STARTED ENGINE CORE") diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 384e1a69170af..ed44ad1ffb318 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -450,4 +450,3 @@ def __init__(self, "revision": revision, }, ) - print("STARTED DETOKENIZER") From 740567fbcecce5afc0f7779e1a06618e6d02b0e7 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 22:30:07 +0000 Subject: [PATCH 119/132] updated --- vllm/v1/engine/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 995492ad85df2..72d4a1ecf4511 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -1,5 +1,5 @@ import time -from typing import Mapping, Optional, Tuple, Union +from typing import Mapping, Optional, Union from vllm.config import CacheConfig, LoRAConfig, ModelConfig from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, From cbc043ede0390474a007a87aed2deb28464b4b4f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 23:05:47 +0000 Subject: [PATCH 120/132] fixup git --- vllm/v1/engine/async_llm.py | 8 +------- vllm/v1/engine/detokenizer.py | 2 +- vllm/v1/engine/llm_engine.py | 33 ++++++++++++++++++++++++++------- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7a682f79e7972..9a3a5530cb052 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -39,7 +39,7 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils import get_open_zmq_ipc_path -from vllm.v1.engine import EngineAbortRequest, EngineRequestType +from vllm.v1.engine import EngineAbortRequest from vllm.v1.engine.core import 
MPEngineCoreClient from vllm.v1.engine.detokenizer import MPDetokenizerClient from vllm.v1.engine.processor import Processor @@ -313,12 +313,6 @@ async def abort(self, request_id: str): if self.log_requests: logger.info("Aborted %s.", request_id) - async def send_to_detokenizer(self, object: Any): - """Send object to Detokenizer with a FROM_ENGINE flag.""" - - msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(object)) - await self.to_detokenizer.send_multipart(msg, copy=False) - def encode( self, prompt: PromptType, diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index ed44ad1ffb318..b54cdd80db4ae 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -5,7 +5,7 @@ import signal from dataclasses import dataclass from multiprocessing.connection import Connection -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index be660a4023b30..a7223ae3571ba 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,7 +1,9 @@ -from typing import Dict, List, Mapping, Optional, Type, Union - +from typing import Any, Dict, List, Mapping, Optional, Type, Union from typing_extensions import TypeVar +import zmq +import pickle + from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase @@ -18,10 +20,12 @@ BaseTokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext from vllm.utils import get_open_zmq_ipc_path +from vllm.v1.engine import EngineRequestType from vllm.v1.engine.core import EngineCore, MPEngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer, MPDetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor +from vllm.v1.utils import make_zmq_socket logger = init_logger(__name__) @@ -65,12 +69,21 @@ def __init__( if self.multiprocess_mode: # IPC paths. - from_engine_core_path = get_open_zmq_ipc_path() + to_detokenizer_path = get_open_zmq_ipc_path() to_engine_core_path = get_open_zmq_ipc_path() + to_llm_engine_path = get_open_zmq_ipc_path() + + # Detokenizer IPC. + self.ctx = zmq.Context(io_threads=2) + self.from_detokenizer = make_zmq_socket( + self.ctx, to_llm_engine_path, zmq.PULL) + self.to_detokenizer = make_zmq_socket( + self.ctx, to_detokenizer_path, zmq.PUSH) # Detokenizer (background process). self.detokenizer_client = MPDetokenizerClient( - from_engine_core_path=from_engine_core_path, + output_path=to_llm_engine_path, + input_path=to_detokenizer_path, to_engine_core_path=to_engine_core_path, tokenizer_name=vllm_config.model_config.tokenizer, tokenizer_mode=vllm_config.model_config.tokenizer_mode, @@ -81,7 +94,7 @@ def __init__( # EngineCore (background process). self.engine_core_client = MPEngineCoreClient( input_path=to_engine_core_path, - output_path=from_engine_core_path, + output_path=to_detokenizer_path, vllm_config=vllm_config, executor_class=executor_class, usage_context=usage_context, @@ -183,7 +196,7 @@ def add_request( # Send to Detokenizer (which forwards to EngineCore). # Note: we forward the message rather than sending # to each process separately to avoid race conditions. 
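# [Editor's note] A minimal, single-process sketch of the "forward, don't fan
# out" rule in the note above: the front-end sends a new request only to the
# Detokenizer, which registers it locally and then relays the same bytes on to
# the EngineCore. Because the Detokenizer always sees the request before the
# EngineCore can produce output for it, outputs can never arrive for a request
# the Detokenizer does not know about. Socket paths and the dict used as
# request state are illustrative, not from the patch.
import pickle
import zmq

ctx = zmq.Context()

# Front-end (LLMEngine / AsyncLLM) -> Detokenizer.
detok_input = ctx.socket(zmq.PULL)
detok_input.bind("inproc://to-detokenizer")
to_detokenizer = ctx.socket(zmq.PUSH)
to_detokenizer.connect("inproc://to-detokenizer")

# Detokenizer -> EngineCore.
core_input = ctx.socket(zmq.PULL)
core_input.bind("inproc://to-engine-core")
detok_to_core = ctx.socket(zmq.PUSH)
detok_to_core.connect("inproc://to-engine-core")

# 1) Front-end sends the new request once, to the Detokenizer only.
request = {"request_id": "req-0", "prompt_token_ids": [1, 2, 3]}
to_detokenizer.send(pickle.dumps(request))

# 2) Detokenizer registers the request, then forwards the identical bytes.
request_states = {}
request_bytes = detok_input.recv()
req = pickle.loads(request_bytes)
request_states[req["request_id"]] = req
detok_to_core.send(request_bytes)

# 3) EngineCore receives the request strictly after the Detokenizer did.
print(pickle.loads(core_input.recv()))

ctx.destroy(linger=0)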
- self.detokenizer_client.input_socket.send_pyobj(engine_request) + self.send_to_detokenizer(engine_request) else: # Add directly to Detokenizer and EngineCore. self.detokenizer.add_request(engine_request) @@ -193,7 +206,7 @@ def step(self) -> List[RequestOutput]: if self.multiprocess_mode: # Get next output from the Detokenizer. - return self.detokenizer_client.output_socket.recv_pyobj() + return self.from_detokenizer.recv_pyobj() else: # Step EngineCore and Detokenizer. engine_core_outputs = self.engine_core.step() @@ -206,6 +219,12 @@ def step(self) -> List[RequestOutput]: return request_outputs + def send_to_detokenizer(self, object: Any): + """Send object to Detokenizer with a FROM_ENGINE flag.""" + + msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(object)) + self.to_detokenizer.send_multipart(msg, copy=False) + def get_model_config(self): return self.model_config From 80610784ce4840d5ad72a9b557c8a468b6a2fc17 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 23:16:32 +0000 Subject: [PATCH 121/132] mypy --- vllm/v1/engine/async_llm.py | 4 ++-- vllm/v1/engine/core.py | 4 ++-- vllm/v1/engine/detokenizer.py | 27 ++++++++++++++------------ vllm/v1/engine/llm_engine.py | 12 +++++++----- vllm/v1/executor/multiproc_executor.py | 4 ++-- vllm/v1/utils.py | 19 ++++++++++-------- 6 files changed, 39 insertions(+), 31 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 9a3a5530cb052..80ea0dc75234f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -98,9 +98,9 @@ def __init__( # Detokenizer IPC. self.ctx = zmq.asyncio.Context(io_threads=2) self.from_detokenizer = make_zmq_socket(self.ctx, to_llm_engine_path, - zmq.PULL) + zmq.constants.PULL) self.to_detokenizer = make_zmq_socket(self.ctx, to_detokenizer_path, - zmq.PUSH) + zmq.constants.PUSH) # Detokenizer (background process). self.detokenizer_client = MPDetokenizerClient( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 136801ffbd617..6ce4a6621080e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -267,7 +267,7 @@ def _handle_client_request(self, request: EngineRequestUnion) -> None: def process_input_socket(self, input_path: str): """Input socket IO thread.""" - with zmq_socket_ctx(input_path, zmq.PULL) as socket: + with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket: while True: # Push to input queue for core busy loop. request = socket.recv_pyobj() @@ -281,7 +281,7 @@ def process_output_socket(self, output_path: str): # Reuse send buffer. 
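# [Editor's note] The `buffer = bytearray()` allocated just below is reused
# for every outgoing batch, so the hot output path never allocates a fresh
# buffer per send. A standalone sketch of that pattern with msgspec's msgpack
# encoder follows; the struct fields and the assumption that the unchanged
# loop body serializes via Encoder.encode_into() are illustrative.
from typing import List

import msgspec


class EngineCoreOutput(msgspec.Struct, array_like=True, omit_defaults=True):
    request_id: str
    new_token_ids: List[int]


class EngineCoreOutputs(msgspec.Struct, array_like=True, omit_defaults=True):
    outputs: List[EngineCoreOutput]


encoder = msgspec.msgpack.Encoder()
decoder = msgspec.msgpack.Decoder(EngineCoreOutputs)
buffer = bytearray()  # allocated once, reused for every send

for step in range(3):
    outputs = EngineCoreOutputs(outputs=[
        EngineCoreOutput(request_id=f"req-{step}", new_token_ids=[1, 2, 3])
    ])
    # Serialize directly into the existing buffer (resized in place).
    encoder.encode_into(outputs, buffer)
    # A real implementation would now do socket.send(buffer, copy=False).
    assert decoder.decode(buffer).outputs[0].request_id == f"req-{step}"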
buffer = bytearray() - with zmq_socket_ctx(output_path, zmq.PUSH) as socket: + with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index b54cdd80db4ae..c3d3e7db6a338 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -15,8 +15,9 @@ AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import get_exception_traceback -from vllm.v1.engine import (EngineCoreOutputs, EngineRequestType, - EngineRequest, EngineAbortRequest) +from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, + EngineRequestType, EngineRequest, + EngineAbortRequest) from vllm.v1.utils import make_zmq_socket, MPBackgroundProcess logger = init_logger(__name__) @@ -250,13 +251,13 @@ def add_request( def step( self, - encore_core_outputs: EngineCoreOutputs, + encore_core_outputs: List[EngineCoreOutput], ) -> Tuple[List[RequestOutput], List[str]]: """Update state and make RequestOutputs for the LLMEngine.""" request_outputs: List[RequestOutput] = [] requests_to_abort: List[str] = [] - for engine_core_output in encore_core_outputs.outputs: + for engine_core_output in encore_core_outputs: request_id = engine_core_output.request_id detokenizer = self.request_states.get(request_id) if detokenizer is None: @@ -349,7 +350,7 @@ def signal_handler(signum, frame): def _handle_from_llm_engine( self, request_bytes: bytes, - to_engine_core: zmq.Socket, + to_engine_core: zmq.constants.Socket, ) -> None: """Handle EngineRequest from the LLMEngine.""" @@ -368,14 +369,14 @@ def _handle_from_llm_engine( def _handle_from_engine_core( self, output_bytes: bytes, - to_engine_core: zmq.Socket, - to_llm_engine: zmq.Socket, + to_engine_core: zmq.constants.Socket, + to_llm_engine: zmq.constants.Socket, decoder: msgspec.msgpack.Decoder, ) -> None: """Handle Outputs from the EngineCore.""" # Deserialize the EngineOutput (use msgpack for performance). - outputs: EngineCoreOutputs = decoder.decode(output_bytes) + outputs: List[EngineCoreOutput] = decoder.decode(output_bytes).outputs # Detokenize. request_outputs, requests_to_abort = self.step(outputs) @@ -392,12 +393,14 @@ def run_busy_loop(self): decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) - ctx = zmq.Context(io_threads=2) + ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] try: - input_socket = make_zmq_socket(ctx, self.input_path, zmq.PULL) - to_llm_engine = make_zmq_socket(ctx, self.output_path, zmq.PUSH) + input_socket = make_zmq_socket(ctx, self.input_path, + zmq.constants.PULL) + to_llm_engine = make_zmq_socket(ctx, self.output_path, + zmq.constants.PUSH) to_engine_core = make_zmq_socket(ctx, self.to_engine_core_path, - zmq.PUSH) + zmq.constants.PUSH) while True: (msg_type, msg_bytes) = input_socket.recv_multipart() diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index a7223ae3571ba..efddf3a049cf5 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -74,11 +74,13 @@ def __init__( to_llm_engine_path = get_open_zmq_ipc_path() # Detokenizer IPC. 
- self.ctx = zmq.Context(io_threads=2) - self.from_detokenizer = make_zmq_socket( - self.ctx, to_llm_engine_path, zmq.PULL) - self.to_detokenizer = make_zmq_socket( - self.ctx, to_detokenizer_path, zmq.PUSH) + self.ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] + self.from_detokenizer = make_zmq_socket(self.ctx, + to_llm_engine_path, + zmq.constants.PULL) + self.to_detokenizer = make_zmq_socket(self.ctx, + to_detokenizer_path, + zmq.constants.PUSH) # Detokenizer (background process). self.detokenizer_client = MPDetokenizerClient( diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 6164a12cda770..78509b9cc6a08 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -250,7 +250,7 @@ def __init__( worker_response_mq_handle = self.worker_response_mq.export_handle() # Send Readiness signal to EngineCore process. - with zmq_socket_ctx(ready_path, zmq.PUSH) as ready_socket: + with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: payload = pickle.dumps(worker_response_mq_handle, protocol=pickle.HIGHEST_PROTOCOL) ready_socket.send_string(WorkerProc.READY_STR) @@ -352,7 +352,7 @@ def wait_for_startup( ready_path: str, ) -> Optional[Handle]: """Wait until the Worker is ready.""" - with zmq_socket_ctx(ready_path, zmq.PULL) as socket: + with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: # Wait for Worker to send READY. while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 1544dd104c434..ce0bccfa40f66 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -84,8 +84,11 @@ def __len__(self): return len(self._x) -def make_zmq_socket(ctx: Union[zmq.asyncio.Context, zmq.Context], path: str, - type: Any) -> Union[zmq.Socket, zmq.asyncio.Socket]: +def make_zmq_socket( + ctx: Union[zmq.asyncio.Context, zmq.Context], # type: ignore[name-defined] + path: str, + type: Any, +) -> Union[zmq.Socket, zmq.asyncio.Socket]: # type: ignore[name-defined] """Make a ZMQ socket with the proper bind/connext semantics.""" import psutil @@ -100,13 +103,13 @@ def make_zmq_socket(ctx: Union[zmq.asyncio.Context, zmq.Context], path: str, else: buf_size = -1 - if type == zmq.PULL: - socket.setsockopt(zmq.RCVHWM, 0) - socket.setsockopt(zmq.RCVBUF, buf_size) + if type == zmq.constants.PULL: + socket.setsockopt(zmq.constants.RCVHWM, 0) + socket.setsockopt(zmq.constants.RCVBUF, buf_size) socket.bind(path) - elif type == zmq.PUSH: - socket.setsockopt(zmq.SNDHWM, 0) - socket.setsockopt(zmq.SNDBUF, buf_size) + elif type == zmq.constants.PUSH: + socket.setsockopt(zmq.constants.SNDHWM, 0) + socket.setsockopt(zmq.constants.SNDBUF, buf_size) socket.connect(path) else: raise ValueError(f"Unknown Socket Type: {type}") From 8372665b8cd1be253f0a753c62d1dcc074a3d10f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 23:25:44 +0000 Subject: [PATCH 122/132] stash --- vllm/entrypoints/openai/api_server.py | 2 +- vllm/utils.py | 2 +- vllm/v1/engine/async_llm.py | 16 +++++++++++----- vllm/v1/engine/core.py | 16 ++++++++++------ vllm/v1/engine/detokenizer.py | 27 ++++++++++++++------------- vllm/v1/engine/llm_engine.py | 8 ++++---- vllm/v1/utils.py | 12 ++++++------ 7 files changed, 47 insertions(+), 36 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 76d9a2bd714cd..4264ff22f8ab9 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -64,7 
+64,7 @@ from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, - is_valid_ipv6_address, set_ulimit, kill_process_tree) + is_valid_ipv6_address, kill_process_tree, set_ulimit) from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds diff --git a/vllm/utils.py b/vllm/utils.py index caed96d200bfc..5c7635f4d3e82 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -16,9 +16,9 @@ import subprocess import sys import tempfile -import traceback import threading import time +import traceback import uuid import warnings import weakref diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 80ea0dc75234f..dbeea5f145fc2 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -16,13 +16,13 @@ # Inspired by https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/managers/tokenizer_manager.py import asyncio -import zmq -import zmq.asyncio import pickle - from typing import (Any, AsyncGenerator, Dict, List, Mapping, Optional, Type, Union) +import zmq +import zmq.asyncio + from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.metrics_types import StatLoggerBase @@ -39,7 +39,7 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils import get_open_zmq_ipc_path -from vllm.v1.engine import EngineAbortRequest +from vllm.v1.engine import EngineAbortRequest, EngineRequestType from vllm.v1.engine.core import MPEngineCoreClient from vllm.v1.engine.detokenizer import MPDetokenizerClient from vllm.v1.engine.processor import Processor @@ -238,7 +238,7 @@ async def generate( The output_handler() loop runs in a background task, pulling from Detokenizer and pushing to the per request queue. - The generate() pulls from the per request queue and yeilds + The generate() pulls from the per request queue and yields to the caller which iterates the AsyncGenerator. 
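# [Editor's note] A self-contained asyncio sketch of the fan-out pattern the
# docstring above describes: a single background task pulls outputs (here from
# a stand-in for the Detokenizer) and routes them into per-request queues,
# while each generate() call only awaits its own queue and yields to the
# caller. The RequestOutput stand-in and the simulated output stream are
# illustrative, not taken from the patch.
import asyncio
from dataclasses import dataclass


@dataclass
class RequestOutput:
    request_id: str
    text: str
    finished: bool


async def fake_detokenizer_stream(out_q: asyncio.Queue) -> None:
    # Stand-in for the Detokenizer socket: emit three chunks per request.
    # Small sleep so generate() below registers its queue first (the real
    # engine guarantees this by registering before sending the request out).
    await asyncio.sleep(0.01)
    for i in range(3):
        for rid in ("req-0", "req-1"):
            await out_q.put(RequestOutput(rid, f"tok{i} ", finished=(i == 2)))


async def output_handler_loop(out_q: asyncio.Queue, rid_to_queue: dict) -> None:
    while True:
        out = await out_q.get()
        # Outputs for unknown / already-finished requests are simply dropped.
        if out.request_id in rid_to_queue:
            rid_to_queue[out.request_id].put_nowait(out)


async def generate(request_id: str, rid_to_queue: dict):
    q: asyncio.Queue = asyncio.Queue()
    rid_to_queue[request_id] = q
    try:
        while True:
            out = await q.get()
            yield out
            if out.finished:
                break
    finally:
        rid_to_queue.pop(request_id, None)


async def main() -> None:
    out_q: asyncio.Queue = asyncio.Queue()
    rid_to_queue: dict = {}
    handler = asyncio.create_task(output_handler_loop(out_q, rid_to_queue))
    producer = asyncio.create_task(fake_detokenizer_stream(out_q))

    async def consume(rid: str) -> str:
        return "".join([out.text async for out in generate(rid, rid_to_queue)])

    print(await asyncio.gather(consume("req-0"), consume("req-1")))
    producer.cancel()
    handler.cancel()


asyncio.run(main())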
""" @@ -313,6 +313,12 @@ async def abort(self, request_id: str): if self.log_requests: logger.info("Aborted %s.", request_id) + async def _send_to_detokenizer(self, obj: Any): + """Send object to Detokenizer with a FROM_ENGINE flag.""" + + msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(object)) + self.to_detokenizer.send_multipart(msg, copy=False) + def encode( self, prompt: PromptType, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 6ce4a6621080e..e4c587f1d7eb1 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1,14 +1,14 @@ -import psutil import queue import signal import threading import time +from multiprocessing.connection import Connection from typing import List, Tuple, Type +import psutil import zmq import zmq.asyncio from msgspec import msgpack -from multiprocessing.connection import Connection from vllm.config import CacheConfig, VllmConfig from vllm.logger import init_logger @@ -17,14 +17,14 @@ from vllm.usage.usage_lib import UsageContext from vllm.utils import get_exception_traceback from vllm.v1.core.scheduler import Scheduler -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineAbortRequest, EngineRequest, - EngineRequestType, EngineProfileRequest, +from vllm.v1.engine import (EngineAbortRequest, EngineCoreOutput, + EngineCoreOutputs, EngineProfileRequest, + EngineRequest, EngineRequestType, EngineRequestUnion) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus -from vllm.v1.utils import zmq_socket_ctx, MPBackgroundProcess +from vllm.v1.utils import MPBackgroundProcess, zmq_socket_ctx from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -310,3 +310,7 @@ def __init__(self, input_path: str, output_path: str, "usage_context": usage_context, }, ) + + async def profile_async(self, is_start: bool = True): + # TODO: enable this. 
+ pass diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index c3d3e7db6a338..94d0741402efe 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,11 +1,12 @@ -import psutil import pickle -import zmq.asyncio -import msgspec import signal from dataclasses import dataclass from multiprocessing.connection import Connection -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Dict, Iterable, List, Optional, Tuple, Union + +import msgspec +import psutil +import zmq from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger @@ -15,10 +16,10 @@ AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import get_exception_traceback -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineRequestType, EngineRequest, - EngineAbortRequest) -from vllm.v1.utils import make_zmq_socket, MPBackgroundProcess +from vllm.v1.engine import (EngineAbortRequest, EngineCoreOutput, + EngineCoreOutputs, EngineRequest, + EngineRequestType) +from vllm.v1.utils import MPBackgroundProcess, make_zmq_socket logger = init_logger(__name__) @@ -348,9 +349,9 @@ def signal_handler(signum, frame): detokenizer = None def _handle_from_llm_engine( - self, - request_bytes: bytes, - to_engine_core: zmq.constants.Socket, + self, + request_bytes: bytes, + to_engine_core: zmq.Socket, # type: ignore[name-defined] ) -> None: """Handle EngineRequest from the LLMEngine.""" @@ -369,8 +370,8 @@ def _handle_from_llm_engine( def _handle_from_engine_core( self, output_bytes: bytes, - to_engine_core: zmq.constants.Socket, - to_llm_engine: zmq.constants.Socket, + to_engine_core: zmq.Socket, # type: ignore[name-defined] + to_llm_engine: zmq.Socket, # type: ignore[name-defined] decoder: msgspec.msgpack.Decoder, ) -> None: """Handle Outputs from the EngineCore.""" diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index efddf3a049cf5..b80a986b90433 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,8 +1,8 @@ +import pickle from typing import Any, Dict, List, Mapping, Optional, Type, Union -from typing_extensions import TypeVar import zmq -import pickle +from typing_extensions import TypeVar from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs @@ -198,7 +198,7 @@ def add_request( # Send to Detokenizer (which forwards to EngineCore). # Note: we forward the message rather than sending # to each process separately to avoid race conditions. - self.send_to_detokenizer(engine_request) + self._send_to_detokenizer(engine_request) else: # Add directly to Detokenizer and EngineCore. 
self.detokenizer.add_request(engine_request) @@ -221,7 +221,7 @@ def step(self) -> List[RequestOutput]: return request_outputs - def send_to_detokenizer(self, object: Any): + def _send_to_detokenizer(self, object: Any): """Send object to Detokenizer with a FROM_ENGINE flag.""" msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(object)) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index ce0bccfa40f66..2f8208e09bbef 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,18 +1,18 @@ import os import weakref -from dataclasses import dataclass -from multiprocessing.process import BaseProcess from collections.abc import Sequence from contextlib import contextmanager -from typing import (Any, Generic, Dict, Iterator, List, Optional, TypeVar, - Union, Callable, overload) +from dataclasses import dataclass +from multiprocessing.process import BaseProcess +from typing import (Any, Callable, Dict, Generic, Iterator, List, Optional, + TypeVar, Union, overload) import zmq import zmq.asyncio +from vllm.executor.multiproc_worker_utils import get_mp_context from vllm.logger import init_logger from vllm.utils import kill_process_tree -from vllm.executor.multiproc_worker_utils import get_mp_context logger = init_logger(__name__) @@ -195,7 +195,7 @@ def wait_for_startup( # Wait for startup. if reader.recv()["status"] != "READY": - raise RuntimeError(f"{process_name} initalization failed. " + raise RuntimeError(f"{process_name} initialization failed. " "See root cause above.") return BackgroundProcHandle(proc, input_path, output_path) From 6b4f2bbe2ba2b9e79b8f5bd940cc73ac80b3bf2c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 23:39:10 +0000 Subject: [PATCH 123/132] almost there with llm engine --- vllm/v1/engine/llm_engine.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index b80a986b90433..25ac92cdb4ce8 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,5 +1,5 @@ import pickle -from typing import Any, Dict, List, Mapping, Optional, Type, Union +from typing import Any, Dict, List, Mapping, Optional, Set, Type, Union import zmq from typing_extensions import TypeVar @@ -68,6 +68,9 @@ def __init__( mm_registry=mm_registry) if self.multiprocess_mode: + # Keep track of active requests. + self.running_requests: Set[str] = set() + # IPC paths. to_detokenizer_path = get_open_zmq_ipc_path() to_engine_core_path = get_open_zmq_ipc_path() @@ -160,10 +163,13 @@ def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: return executor_class def get_num_unfinished_requests(self) -> int: - return self.detokenizer.get_num_unfinished_requests() + if self.multiprocess_mode: + return len(self.running_requests) + else: + return self.detokenizer.get_num_unfinished_requests() def has_unfinished_requests(self) -> bool: - return self.detokenizer.has_unfinished_requests() + return self.get_num_unfinished_requests() > 0 @classmethod def validate_outputs(cls, outputs, output_type): @@ -193,8 +199,10 @@ def add_request( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # Add to Detokenizer and EngineCore. + # Add processed input to system. if self.multiprocess_mode: + assert engine_request.request_id not in self.running_requests + self.running_requests.add(engine_request.request_id) # Send to Detokenizer (which forwards to EngineCore). 
# Note: we forward the message rather than sending # to each process separately to avoid race conditions. @@ -208,7 +216,14 @@ def step(self) -> List[RequestOutput]: if self.multiprocess_mode: # Get next output from the Detokenizer. - return self.from_detokenizer.recv_pyobj() + request_outputs: List[ + RequestOutput] = self.from_detokenizer.recv_pyobj() + + # Removed finsihed requests from the state tracker. + for out in request_outputs: + if out.finished: + self.running_requests.remove(out.request_id) + else: # Step EngineCore and Detokenizer. engine_core_outputs = self.engine_core.step() @@ -219,7 +234,7 @@ def step(self) -> List[RequestOutput]: if requests_to_abort: self.abort_request(requests_to_abort) - return request_outputs + return request_outputs def _send_to_detokenizer(self, object: Any): """Send object to Detokenizer with a FROM_ENGINE flag.""" From db7d055b15d35cc21b1f6dae8b120b2655faab14 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 23:41:28 +0000 Subject: [PATCH 124/132] format' --- vllm/v1/engine/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 25ac92cdb4ce8..c26d6543d6728 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -219,7 +219,7 @@ def step(self) -> List[RequestOutput]: request_outputs: List[ RequestOutput] = self.from_detokenizer.recv_pyobj() - # Removed finsihed requests from the state tracker. + # Removed finished requests from the state tracker. for out in request_outputs: if out.finished: self.running_requests.remove(out.request_id) From 98053d6eb3ce349da7b9bbd52ef4e626f4bbeed6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 24 Dec 2024 14:28:43 +0000 Subject: [PATCH 125/132] clean --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index dbeea5f145fc2..a1b44b19d274c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -232,7 +232,7 @@ async def generate( """ Main function called by the API server to kick off a request * 1) Make an output queue for the Request. - # 2) Processing the Input (e.g. Tokenizer). + * 2) Processing the Input (e.g. Tokenizer, MM). * 3) Adding the Request to Detokenizer + EngineCore. The output_handler() loop runs in a background task, pulling From 4713e29c2787409b7ca9472c6d2a3b9a2a674b9e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 24 Dec 2024 14:29:08 +0000 Subject: [PATCH 126/132] updated --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a1b44b19d274c..b2f5cc66b3f36 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -199,7 +199,7 @@ async def add_request( ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" - # 1) Convert Input --> EngineRequest. + # 1) Convert Input --> EngineRequest (Tokenize, MM, etc). 
engine_request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) From 4f946ebc749f0bf85f8d0a0149646c117d2249f1 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 24 Dec 2024 14:30:15 +0000 Subject: [PATCH 127/132] nit --- vllm/v1/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 2f8208e09bbef..5dfa7470c3a47 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -89,7 +89,7 @@ def make_zmq_socket( path: str, type: Any, ) -> Union[zmq.Socket, zmq.asyncio.Socket]: # type: ignore[name-defined] - """Make a ZMQ socket with the proper bind/connext semantics.""" + """Make a ZMQ socket with the proper bind/connect semantics.""" import psutil mem = psutil.virtual_memory() From 59c64300d792cd3b42641c0172a9a03073beccd1 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Tue, 24 Dec 2024 09:31:22 -0500 Subject: [PATCH 128/132] Update vllm/v1/utils.py Co-authored-by: Michael Goin --- vllm/v1/utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 5dfa7470c3a47..49b0cf19fd851 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -96,12 +96,17 @@ def make_zmq_socket( socket = ctx.socket(type) + # Calculate buffer size based on system memory total_mem = mem.total / 1024**3 available_mem = mem.available / 1024**3 + # For systems with substantial memory (>32GB total, >16GB available): + # - Set a large 0.5GB buffer to improve throughput + # For systems with less memory: + # - Use system default (-1) to avoid excessive memory consumption if total_mem > 32 and available_mem > 16: - buf_size = int(0.5 * 1024**3) + buf_size = int(0.5 * 1024**3) # 0.5GB in bytes else: - buf_size = -1 + buf_size = -1 # Use system default buffer size if type == zmq.constants.PULL: socket.setsockopt(zmq.constants.RCVHWM, 0) From 856838d1838575a149df042b6d544da3b2715e0a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 24 Dec 2024 14:37:09 +0000 Subject: [PATCH 129/132] updated --- vllm/v1/engine/core.py | 3 +++ vllm/v1/engine/detokenizer.py | 3 +++ vllm/v1/engine/llm_engine.py | 5 ++++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e4c587f1d7eb1..ace4bb8bfed1d 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -210,7 +210,10 @@ def run_busy_loop(self): """Core busy loop of the EngineCore.""" # Loop until process is sent a SIGINT or SIGTERM + i = 0 while True: + print(f"EPOCH: {i}") + i += 1 # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): while True: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 94d0741402efe..70ef3d018c2c6 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -402,7 +402,10 @@ def run_busy_loop(self): zmq.constants.PUSH) to_engine_core = make_zmq_socket(ctx, self.to_engine_core_path, zmq.constants.PUSH) + i = 0 while True: + print(f"EPOCH: {i}") + i += 1 (msg_type, msg_bytes) = input_socket.recv_multipart() # Handle message from LLMEngine (Abort or New Request). 
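# [Editor's note] A standalone sketch consolidating the bind/connect and
# buffer-sizing policy that the make_zmq_socket() changes above settle on:
# PULL sockets bind and PUSH sockets connect, the high-water mark is disabled,
# and the 0.5 GiB OS buffer is only requested on hosts with plenty of memory
# (> 32 GiB total and > 16 GiB available). This is a simplified re-statement
# for illustration, not the patch's exact function; the ipc path is made up.
import psutil
import zmq


def make_example_socket(ctx: zmq.Context, path: str, sock_type: int) -> zmq.Socket:
    mem = psutil.virtual_memory()
    total_gib = mem.total / 1024**3
    available_gib = mem.available / 1024**3
    # Large buffers help throughput, but only ask for them when the host
    # clearly has memory to spare; otherwise keep the OS default (-1).
    buf_size = int(0.5 * 1024**3) if (total_gib > 32 and available_gib > 16) else -1

    socket = ctx.socket(sock_type)
    if sock_type == zmq.PULL:
        socket.setsockopt(zmq.RCVHWM, 0)  # no message-count limit
        socket.setsockopt(zmq.RCVBUF, buf_size)
        socket.bind(path)
    elif sock_type == zmq.PUSH:
        socket.setsockopt(zmq.SNDHWM, 0)
        socket.setsockopt(zmq.SNDBUF, buf_size)
        socket.connect(path)
    else:
        raise ValueError(f"Unknown socket type: {sock_type}")
    return socket


if __name__ == "__main__":
    ctx = zmq.Context()
    pull = make_example_socket(ctx, "ipc:///tmp/example_sock", zmq.PULL)
    push = make_example_socket(ctx, "ipc:///tmp/example_sock", zmq.PUSH)
    push.send(b"ping")
    print(pull.recv())
    ctx.destroy(linger=0)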
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 3e707af7bc10e..2b4a0ec855124 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -25,8 +25,8 @@ from vllm.v1.engine.detokenizer import Detokenizer, MPDetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.utils import make_zmq_socket from vllm.v1.executor.ray_utils import initialize_ray_cluster +from vllm.v1.utils import make_zmq_socket logger = init_logger(__name__) @@ -199,6 +199,8 @@ def add_request( priority: int = 0, ) -> None: + logger.info("Added request.") + # Process raw inputs into the request. engine_request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, @@ -219,6 +221,7 @@ def add_request( def step(self) -> List[RequestOutput]: + logger.info("Called step.") if self.multiprocess_mode: # Get next output from the Detokenizer. request_outputs: List[ From 94fe4afbb397cd621dbd5525f8756ef3fe478528 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 24 Dec 2024 14:50:41 +0000 Subject: [PATCH 130/132] updated --- vllm/v1/engine/async_llm.py | 6 +++--- vllm/v1/engine/core.py | 3 --- vllm/v1/engine/detokenizer.py | 4 +--- vllm/v1/engine/llm_engine.py | 2 -- 4 files changed, 4 insertions(+), 11 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b2f5cc66b3f36..cb87985449b9e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -210,7 +210,7 @@ async def add_request( # 3) Send to Detokenizer (which forwards to EngineCore). # Note: we forward the request rather than sending to each # process separately to avoid race conditions in Detokenizer). - await self.send_to_detokenizer(engine_request) + await self._send_to_detokenizer(engine_request) return self.rid_to_queue[request_id] @@ -304,7 +304,7 @@ async def abort(self, request_id: str): """Abort request if the client cancels the request.""" # Send abort to Detokenizer (which will fwd to EngineCore). - await self.send_to_detokenizer(EngineAbortRequest([request_id])) + await self._send_to_detokenizer(EngineAbortRequest([request_id])) # Remove from request output queues. if request_id in self.rid_to_queue: @@ -317,7 +317,7 @@ async def _send_to_detokenizer(self, obj: Any): """Send object to Detokenizer with a FROM_ENGINE flag.""" msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(object)) - self.to_detokenizer.send_multipart(msg, copy=False) + await self.to_detokenizer.send_multipart(msg, copy=False) def encode( self, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index ace4bb8bfed1d..e4c587f1d7eb1 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -210,10 +210,7 @@ def run_busy_loop(self): """Core busy loop of the EngineCore.""" # Loop until process is sent a SIGINT or SIGTERM - i = 0 while True: - print(f"EPOCH: {i}") - i += 1 # 1) Poll the input queue until there is work to do. 
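# [Editor's note] A standalone sketch of the two-phase input polling in
# step 1 just below: while the scheduler has nothing to run, the busy loop
# blocks on the input queue (with a timeout so it can still log); once there
# is work, it only drains whatever is already queued without blocking before
# stepping. The scheduler / request stand-ins are illustrative, not from the
# patch.
import queue

POLLING_TIMEOUT_S = 2.5


def busy_loop(input_queue: "queue.Queue[str]", num_steps: int) -> None:
    pending: list = []

    for _ in range(num_steps):
        # Phase 1: nothing scheduled -> block until a request shows up.
        if not pending:
            while True:
                try:
                    pending.append(input_queue.get(timeout=POLLING_TIMEOUT_S))
                    break
                except queue.Empty:
                    print("busy loop waiting for work ...")

        # Phase 2: already busy -> drain new inputs without blocking.
        while not input_queue.empty():
            pending.append(input_queue.get_nowait())

        # Stand-in for scheduler.schedule() + execute_model() + outputs.
        print(f"step over {len(pending)} request(s)")
        pending.clear()


if __name__ == "__main__":
    q: "queue.Queue[str]" = queue.Queue()
    for rid in ("req-0", "req-1", "req-2"):
        q.put(rid)
    busy_loop(q, num_steps=1)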
if not self.scheduler.has_unfinished_requests(): while True: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 70ef3d018c2c6..2d8724e687448 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -402,10 +402,8 @@ def run_busy_loop(self): zmq.constants.PUSH) to_engine_core = make_zmq_socket(ctx, self.to_engine_core_path, zmq.constants.PUSH) - i = 0 + while True: - print(f"EPOCH: {i}") - i += 1 (msg_type, msg_bytes) = input_socket.recv_multipart() # Handle message from LLMEngine (Abort or New Request). diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 2b4a0ec855124..149373023c8f2 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -220,8 +220,6 @@ def add_request( self.engine_core.add_request(engine_request) def step(self) -> List[RequestOutput]: - - logger.info("Called step.") if self.multiprocess_mode: # Get next output from the Detokenizer. request_outputs: List[ From 127045a682024d0ca6ab09a70a8f00987fdcd8b9 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 24 Dec 2024 14:59:57 +0000 Subject: [PATCH 131/132] stash --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index cb87985449b9e..d82f278e36744 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -316,7 +316,7 @@ async def abort(self, request_id: str): async def _send_to_detokenizer(self, obj: Any): """Send object to Detokenizer with a FROM_ENGINE flag.""" - msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(object)) + msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(obj)) await self.to_detokenizer.send_multipart(msg, copy=False) def encode( From 1352386fad6ac73bd62611e86cc8c5d9411968bf Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 24 Dec 2024 15:03:38 +0000 Subject: [PATCH 132/132] remove log --- vllm/v1/engine/llm_engine.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 149373023c8f2..536fdb28717b4 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -199,8 +199,6 @@ def add_request( priority: int = 0, ) -> None: - logger.info("Added request.") - # Process raw inputs into the request. engine_request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request,