From c2ad07cf492a02020ba3fcf13186ab77007d3657 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 03:42:28 +0000 Subject: [PATCH 001/132] first rev of 3 process architecture --- examples/openai_completion_client.py | 8 +- vllm/v1/engine/__init__.py | 23 ++- vllm/v1/engine/async_llm.py | 52 ++++--- vllm/v1/engine/core.py | 78 ++++------ vllm/v1/engine/core_client.py | 47 ++---- vllm/v1/engine/detokenizer.py | 201 ++++++++++++++++++++++++- vllm/v1/executor/multiproc_executor.py | 6 +- vllm/v1/utils.py | 58 +++++-- 8 files changed, 343 insertions(+), 130 deletions(-) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index 58519f978d340..1f8b82bc5c9e9 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -2,7 +2,7 @@ # Modify OpenAI's API key and API base to use vLLM's API server. openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" +openai_api_base = "http://localhost:8001/v1" client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") @@ -14,14 +14,12 @@ model = models.data[0].id # Completion API -stream = False +stream = True completion = client.completions.create( model=model, prompt="A robot may not injure a human being", echo=False, - n=2, - stream=stream, - logprobs=3) + stream=stream) print("Completion results:") if stream: diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index cc0c7ea23469a..ee6b90b1bab1f 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,5 +1,6 @@ import enum from dataclasses import dataclass +from multiprocessing.process import BaseProcess from typing import List, Optional, Union import msgspec @@ -10,7 +11,18 @@ @dataclass -class DetokenizerRequest: +class BackgroundProcHandle: + proc: BaseProcess + ready_path: str + input_path: str + output_path: str + + +class DetokenizerRequest( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True, # type: ignore[call-arg] + gc=False): # type: ignore[call-arg] request_id: str prompt: Optional[str] @@ -75,6 +87,15 @@ class EngineCoreProfile: is_start: bool +class DetokenizerRequestType(enum.Enum): + """ + Request types defined as hex byte strings, so it can be sent over sockets + without separate encoding step. + """ + NEW = b'\x00' + OUT = b'\x01' + + class EngineCoreRequestType(enum.Enum): """ Request types defined as hex byte strings, so it can be sent over sockets diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b36de5f66917c..6b158ca5f667b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -16,9 +16,10 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext +from vllm.utils import get_open_zmq_ipc_path from vllm.v1.engine.async_stream import AsyncStream -from vllm.v1.engine.core_client import EngineCoreClient -from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.core_client import AsyncMPClient +from vllm.v1.engine.detokenizer import DetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor @@ -64,8 +65,13 @@ def __init__( vllm_config.lora_config, self.tokenizer, input_registry) + + # IPC path for EngineCore -> Detokenizer. + engine_core_outputs_path = get_open_zmq_ipc_path() + # Detokenizer (converts EngineCoreOutputs --> RequestOutput). 
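
The pieces added above (msgspec Structs plus the one-byte `DetokenizerRequestType` tags) describe a simple wire format: each ZMQ message is a multipart frame of [type byte, msgpack payload], so no separate envelope encoding is needed. A rough, self-contained sketch of that framing follows; `DemoRequest`, the inproc path, and the field names are illustrative, not part of this patch.

import msgspec
import zmq

class DemoRequest(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
    request_id: str
    prompt: str

NEW_REQUEST = b"\x00"   # one-byte request-type tag, like DetokenizerRequestType.NEW

ctx = zmq.Context()
push = ctx.socket(zmq.PUSH)
push.bind("inproc://demo")      # PUSH ends bind in this patch's convention
pull = ctx.socket(zmq.PULL)
pull.connect("inproc://demo")   # PULL ends connect

encoder = msgspec.msgpack.Encoder()
decoder = msgspec.msgpack.Decoder(DemoRequest)

# Send: [type tag, msgpack body]; receive: decode straight from the frame buffer.
push.send_multipart((NEW_REQUEST, encoder.encode(DemoRequest("req-0", "hi"))))
type_frame, data_frame = pull.recv_multipart(copy=False)
assert bytes(type_frame.buffer) == NEW_REQUEST
print(decoder.decode(data_frame.buffer))

ctx.destroy(linger=0)
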
- self.detokenizer = Detokenizer( + self.detokenizer = DetokenizerClient( + engine_core_outputs_path=engine_core_outputs_path, tokenizer_name=vllm_config.model_config.tokenizer, tokenizer_mode=vllm_config.model_config.tokenizer_mode, trust_remote_code=vllm_config.model_config.trust_remote_code, @@ -73,12 +79,11 @@ def __init__( ) # EngineCore (starts the engine in background process). - self.engine_core = EngineCoreClient.make_client( + self.engine_core = AsyncMPClient( + output_path=engine_core_outputs_path, vllm_config=vllm_config, executor_class=executor_class, usage_context=usage_context, - multiprocess_mode=True, - asyncio_mode=True, ) self.output_handler: Optional[asyncio.Task] = None @@ -121,6 +126,9 @@ def shutdown(self): if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() + + if detokenizer := getattr(self, "detokenizer", None): + detokenizer.shutdown() if handler := getattr(self, "output_handler", None): handler.cancel() @@ -152,8 +160,8 @@ async def add_request( ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: """Add new request to the AsyncLLM.""" - if self.detokenizer.is_request_active(request_id): - raise ValueError(f"Request {request_id} already exists.") + # if self.detokenizer.is_request_active(request_id): + # raise ValueError(f"Request {request_id} already exists.") # 1) Create a new AsyncStream for the request. stream = self._add_request_to_streams(request_id) @@ -163,10 +171,10 @@ async def add_request( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 3) Add the request to Detokenizer (this process). - self.detokenizer.add_request(detokenizer_req) + # 3) Add the DetokenizerRequest to Detokenizer. + await self.detokenizer.add_request_async(detokenizer_req) - # 4) Add the EngineCoreRequest to EngineCore (separate process). + # 4) Add the EngineCoreRequest to EngineCore. await self.engine_core.add_request_async(engine_core_req) # 5) Return the generator. @@ -296,29 +304,26 @@ async def _run_output_handler(self): try: while True: - # 1) Pull EngineCoreOutput from the EngineCore. - outputs = await self.engine_core.get_output_async() + # 1) Pull outputs from the Detokenizer. + request_outputs, reqs_to_abort = ( + await self.detokenizer.get_output_async()) - # 2) Detokenize based on the output. - request_outputs, reqs_to_abort = self.detokenizer.step(outputs) - - # 3) Put the RequestOutputs into the per-request AsyncStreams. + # 2) Put the RequestOutputs into the per-request AsyncStreams. self._process_request_outputs(request_outputs) - # 4) Abort any requests that finished due to stop strings. + # 3) Abort any requests that finished due to stop strings. await self.engine_core.abort_requests_async(reqs_to_abort) - # 5) Abort any requests due to client cancellations. + # 4) Abort any requests due to client cancellations. + # TODO: send back to detokenizer if this fails. await self._process_cancellations() except BaseException as e: logger.error(e) raise e - # TODO: can we eliminate these? - async def abort(self, request_id: str) -> None: - # Note: Who Calls this? I dont think this is actually used. + # Note: this is not used outside of testing. 
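
The `_run_output_handler` loop above is the fan-out point: one background task drains the Detokenizer and pushes each `RequestOutput` into the per-request `AsyncStream` that the corresponding `generate()` call is reading. A stripped-down model of that relationship, with an `asyncio.Queue` standing in for `AsyncStream` and tuples standing in for real outputs (all names here are illustrative):

import asyncio

async def main():
    streams = {"req-0": asyncio.Queue()}   # per-request streams
    detok_outputs = asyncio.Queue()        # stands in for the Detokenizer output pipe

    async def output_handler():
        # Background task: pull (request_id, text, finished) tuples and fan
        # them out to the matching per-request stream; drop finished requests.
        while True:
            request_id, text, finished = await detok_outputs.get()
            stream = streams.get(request_id)
            if stream is not None:
                await stream.put((text, finished))
            if finished:
                streams.pop(request_id, None)

    async def generate(request_id):
        # Each API call reads only from its own stream.
        stream = streams[request_id]
        while True:
            text, finished = await stream.get()
            print("yield:", text)
            if finished:
                return

    handler = asyncio.create_task(output_handler())
    consumer = asyncio.create_task(generate("req-0"))
    for text, finished in [("Hello", False), ("Hello world", True)]:
        await detok_outputs.put(("req-0", text, finished))
    await consumer
    handler.cancel()

asyncio.run(main())
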
raise ValueError("Not Supported on V1 yet.") def encode( @@ -345,8 +350,7 @@ async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, ) -> AnyTokenizer: - assert lora_request is None - return self.detokenizer.tokenizer + return self.tokenizer.get_lora_tokenizer(lora_request) async def is_tracing_enabled(self) -> bool: return False diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 56d4dc67e4a0e..dcafac4ad2463 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -3,8 +3,6 @@ import signal import threading import time -from dataclasses import dataclass -from multiprocessing.process import BaseProcess from typing import List, Tuple, Type import zmq @@ -17,15 +15,18 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext +from vllm.utils import get_open_zmq_ipc_path from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion) + EngineCoreRequestType, EngineCoreRequestUnion, + DetokenizerRequestType, + BackgroundProcHandle) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus from vllm.v1.serial_utils import PickleEncoder -from vllm.v1.utils import make_zmq_socket +from vllm.v1.utils import zmq_socket_ctx, wait_for_startup from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -134,14 +135,6 @@ def profile(self, is_start: bool = True): self.model_executor.profile(is_start) -@dataclass -class EngineCoreProcHandle: - proc: BaseProcess - ready_path: str - input_path: str - output_path: str - - class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" @@ -173,38 +166,11 @@ def __init__( daemon=True).start() # Send Readiness signal to EngineClient. - with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket: + logger.info("ABOUT TO SEND READINESS") + with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: + logger.info("SENDING READY SIGNAL") ready_socket.send_string(EngineCoreProc.READY_STR) - @staticmethod - def wait_for_startup( - proc: BaseProcess, - ready_path: str, - ) -> None: - """Wait until the EngineCore is ready.""" - - try: - sync_ctx = zmq.Context() # type: ignore[attr-defined] - socket = sync_ctx.socket(zmq.constants.PULL) - socket.connect(ready_path) - - # Wait for EngineCore to send EngineCoreProc.READY_STR. 
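
The startup wait removed here is generalized into `wait_for_startup` in vllm/v1/utils.py later in this patch, but the handshake itself stays the same: the child process PUSHes a READY string to a throwaway IPC path, and the parent polls the matching PULL socket with a timeout so it can also notice a dead child. A standalone sketch of that handshake; the path and names are illustrative.

import multiprocessing as mp
import os
import tempfile
import zmq

READY_STR = "READY"

def child(ready_path: str):
    # Child process: signal readiness once its setup is done.
    ctx = zmq.Context()
    sock = ctx.socket(zmq.PUSH)
    sock.bind(ready_path)
    sock.send_string(READY_STR)
    ctx.destroy(linger=0)

def wait_for_startup(proc, ready_path, timeout_ms=5000):
    ctx = zmq.Context()
    sock = ctx.socket(zmq.PULL)
    sock.connect(ready_path)
    try:
        # Poll instead of blocking so a crashed child is detected.
        while sock.poll(timeout=timeout_ms) == 0:
            if not proc.is_alive():
                raise RuntimeError("Background process failed to start.")
        assert sock.recv_string() == READY_STR
    finally:
        ctx.destroy(linger=0)

if __name__ == "__main__":
    ready_path = f"ipc://{tempfile.gettempdir()}/demo_ready_{os.getpid()}"
    proc = mp.get_context("spawn").Process(target=child, args=(ready_path,))
    proc.start()
    wait_for_startup(proc, ready_path)
    proc.join()
    print("child was ready")
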
- while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: - logger.debug("Waiting for EngineCoreProc to startup.") - - if not proc.is_alive(): - raise RuntimeError("EngineCoreProc failed to start.") - - message = socket.recv_string() - assert message == EngineCoreProc.READY_STR - - except BaseException as e: - logger.exception(e) - raise e - - finally: - sync_ctx.destroy(linger=0) - @staticmethod def make_engine_core_process( vllm_config: VllmConfig, @@ -212,9 +178,9 @@ def make_engine_core_process( usage_context: UsageContext, input_path: str, output_path: str, - ready_path: str, - ) -> EngineCoreProcHandle: + ) -> BackgroundProcHandle: context = get_mp_context() + ready_path = get_open_zmq_ipc_path() process_kwargs = { "input_path": input_path, @@ -228,10 +194,14 @@ def make_engine_core_process( proc = context.Process(target=EngineCoreProc.run_engine_core, kwargs=process_kwargs) proc.start() - - # Wait for startup - EngineCoreProc.wait_for_startup(proc, ready_path) - return EngineCoreProcHandle(proc=proc, + logger.info("WAITING FOR STARTUP") + wait_for_startup(proc=proc, + ready_path=ready_path, + ready_str=EngineCoreProc.READY_STR, + timeout_ms=POLLING_TIMEOUT_MS) + logger.info("READY") + + return BackgroundProcHandle(proc=proc, ready_path=ready_path, input_path=input_path, output_path=output_path) @@ -284,12 +254,13 @@ def run_busy_loop(self): if not self.scheduler.has_unfinished_requests(): while True: try: + logger.info("getting from input queue") req = self.input_queue.get(timeout=POLLING_TIMEOUT_S) self._handle_client_request(req) break except queue.Empty: self._log_stats() - logger.debug("EngineCore busy loop waiting.") + logger.info("EngineCore busy loop waiting.") except BaseException: raise @@ -302,6 +273,7 @@ def run_busy_loop(self): outputs = self.step() # 4) Put EngineCoreOutputs into the output queue. + logger.info("putting to output queue") self.output_queue.put_nowait(outputs) self._log_stats() @@ -339,7 +311,7 @@ def process_input_socket(self, input_path: str): decoder_add_req = PickleEncoder() decoder_abort_req = PickleEncoder() - with make_zmq_socket(input_path, zmq.constants.PULL) as socket: + with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket: while True: # (RequestType, RequestData) type_frame, data_frame = socket.recv_multipart(copy=False) @@ -367,9 +339,11 @@ def process_output_socket(self, output_path: str): # Reuse send buffer. 
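
The "reuse send buffer" idea above relies on msgspec's `encode_into`, which serializes into a caller-owned bytearray so the hot send loop avoids allocating a fresh bytes object per message. A tiny illustration; the `Payload` type is made up for the example.

import msgspec

class Payload(msgspec.Struct, array_like=True):
    request_id: str
    token_ids: list[int]

encoder = msgspec.msgpack.Encoder()
buffer = bytearray()   # allocated once, grown as needed, reused every iteration

for step in range(3):
    encoder.encode_into(Payload("req-0", list(range(step + 1))), buffer)
    # In the real loop, socket.send_multipart((buffer, ), copy=False) would go here.
    print(len(buffer), bytes(buffer)[:16])
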
buffer = bytearray() - with make_zmq_socket(output_path, zmq.constants.PUSH) as socket: + logger.info(f"{output_path=}") + with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) encoder.encode_into(outputs, buffer) - socket.send_multipart((buffer, ), copy=False) + msg = (DetokenizerRequestType.OUT.value, buffer) + socket.send_multipart(msg, copy=False) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index ff25a9b2e9cac..cfd3edab13877 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -7,13 +7,14 @@ import zmq.asyncio from vllm.logger import init_logger -from vllm.utils import get_open_zmq_ipc_path, kill_process_tree -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, +from vllm.utils import kill_process_tree, get_open_zmq_ipc_path +from vllm.v1.engine import (BackgroundProcHandle, + EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) -from vllm.v1.engine.core import (EngineCore, EngineCoreProc, - EngineCoreProcHandle) +from vllm.v1.engine.core import (EngineCore, EngineCoreProc) from vllm.v1.serial_utils import PickleEncoder +from vllm.v1.utils import make_zmq_socket logger = init_logger(__name__) @@ -129,6 +130,7 @@ class MPClient(EngineCoreClient): def __init__( self, *args, + output_path: str, asyncio_mode: bool, **kwargs, ): @@ -142,27 +144,19 @@ def __init__( else: self.ctx = zmq.Context() # type: ignore[attr-defined] - # Path for IPC. - ready_path = get_open_zmq_ipc_path() - output_path = get_open_zmq_ipc_path() input_path = get_open_zmq_ipc_path() - - # Get output (EngineCoreOutput) from EngineCore. - self.output_socket = self.ctx.socket(zmq.constants.PULL) - self.output_socket.connect(output_path) - - # Send input (EngineCoreRequest) to EngineCore. - self.input_socket = self.ctx.socket(zmq.constants.PUSH) - self.input_socket.bind(input_path) + self.input_socket = make_zmq_socket( + self.ctx, + input_path, + zmq.constants.PUSH, + ) # Start EngineCore in background process. 
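
`make_zmq_socket`, used just above and defined in vllm/v1/utils.py later in this patch, pins down one convention for these pipes: the PUSH end binds and the PULL end connects, so either side of a pipe can come up first and the producer's sends simply wait until the consumer attaches. A minimal usage sketch of that convention (single process, throwaway IPC path, illustrative message):

import os
import tempfile
import zmq

def make_zmq_socket(ctx, path, sock_type):
    # Convention from this patch: PUSH ends bind, PULL ends connect.
    sock = ctx.socket(sock_type)
    if sock_type == zmq.PULL:
        sock.connect(path)
    elif sock_type == zmq.PUSH:
        sock.bind(path)
    else:
        raise ValueError(f"Unknown socket type: {sock_type}")
    return sock

path = f"ipc://{tempfile.gettempdir()}/demo_pipe_{os.getpid()}"
ctx = zmq.Context()
pull = make_zmq_socket(ctx, path, zmq.PULL)   # consumer can attach first...
push = make_zmq_socket(ctx, path, zmq.PUSH)   # ...producer binds later
push.send(b"hello")
print(pull.recv())
ctx.destroy(linger=0)
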
- self.proc_handle: Optional[EngineCoreProcHandle] + self.proc_handle: Optional[BackgroundProcHandle] self.proc_handle = EngineCoreProc.make_engine_core_process( *args, - input_path= - input_path, # type: ignore[misc] # MyPy incorrectly flags duplicate keywords - output_path=output_path, # type: ignore[misc] - ready_path=ready_path, # type: ignore[misc] + input_path=input_path, + output_path=output_path, **kwargs, ) atexit.register(self.shutdown) @@ -207,12 +201,6 @@ class SyncMPClient(MPClient): def __init__(self, *args, **kwargs): super().__init__(*args, asyncio_mode=False, **kwargs) - def get_output(self) -> List[EngineCoreOutput]: - - (frame, ) = self.output_socket.recv_multipart(copy=False) - engine_core_outputs = self.decoder.decode(frame.buffer).outputs - return engine_core_outputs - def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: @@ -237,13 +225,6 @@ class AsyncMPClient(MPClient): def __init__(self, *args, **kwargs): super().__init__(*args, asyncio_mode=True, **kwargs) - async def get_output_async(self) -> List[EngineCoreOutput]: - - frames = await self.output_socket.recv_multipart(copy=False) - engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs - - return engine_core_outputs - async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 02f34e2b54dd5..a2f8c4a29b662 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,17 +1,29 @@ +import pickle +import zmq.asyncio +import msgspec +import signal from dataclasses import dataclass from typing import Dict, Iterable, List, Optional, Tuple, Union from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.executor.multiproc_worker_utils import get_mp_context from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import RequestOutputKind from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput +from vllm.utils import get_open_zmq_ipc_path +from vllm.v1.engine import (DetokenizerRequest, DetokenizerRequestType, + EngineCoreOutput, EngineCoreOutputs, + BackgroundProcHandle,) +from vllm.v1.utils import (make_zmq_socket, zmq_socket_ctx, + wait_for_startup) +from vllm.v1.serial_utils import PickleEncoder logger = init_logger(__name__) +POLLING_TIMEOUT_MS = 5000 @dataclass class IncrementalDetokenizer: @@ -270,3 +282,190 @@ def step( # Return to EngineClient. return request_outputs, requests_to_abort + +class DetokenizerProc(Detokenizer): + """ZMQ-wrapper for running Detokenizer in background process.""" + + READY_STR = "READY" + + def __init__( + self, + *args, + engine_core_outputs_path: str, + input_path: str, + output_path: str, + ready_path: str, + **kwargs + ): + super().__init__(*args, **kwargs) + + self.engine_core_outputs_path = engine_core_outputs_path + self.input_path = input_path + self.output_path = output_path + + # Send readiness signal. 
+ with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: + ready_socket.send_string(DetokenizerProc.READY_STR) + + + @staticmethod + def make_detokenizer_process( + engine_core_outputs_path: str, + input_path: str, + output_path: str, + tokenizer_name: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + revision: Optional[str] = None, + ) -> BackgroundProcHandle: + context = get_mp_context() + ready_path = get_open_zmq_ipc_path() + + process_kwargs = { + "engine_core_outputs_path": engine_core_outputs_path, + "input_path": input_path, + "output_path": output_path, + "ready_path": ready_path, + "tokenizer_name": tokenizer_name, + "tokenizer_mode": tokenizer_mode, + "trust_remote_code": trust_remote_code, + "revision": revision, + } + # Run Detokenizer busy loop in background process. + proc = context.Process(target=DetokenizerProc.run_detokenizer, + kwargs=process_kwargs) + proc.start() + wait_for_startup(proc=proc, + ready_path=ready_path, + ready_str=DetokenizerProc.READY_STR, + timeout_ms=POLLING_TIMEOUT_MS) + + return BackgroundProcHandle(proc=proc, + ready_path=ready_path, + input_path=input_path, + output_path=output_path) + + @staticmethod + def run_detokenizer(*args, **kwargs): + """Launch Detokenizer busy loop in background process.""" + + # Signal handler used for graceful termination. + # SystemExit exception is only raised once to allow this and worker + # processes to terminate without error + shutdown_requested = False + + def signal_handler(signum, frame): + nonlocal shutdown_requested + if not shutdown_requested: + shutdown_requested = True + raise SystemExit() + + # Either SIGTERM or SIGINT will terminate the engine_core + signal.signal(signal.SIGTERM, signal_handler) + signal.signal(signal.SIGINT, signal_handler) + + detokenizer = None + try: + detokenizer = DetokenizerProc(*args, **kwargs) + detokenizer.run_busy_loop() + + except SystemExit: + logger.debug("Detokenizer interrupted.") + + except BaseException as e: + logger.exception(e) + raise e + + finally: + if detokenizer is not None: + detokenizer = None + + def run_busy_loop(self): + """Core busy loop of the Detokenizer.""" + + decoder_new = msgspec.msgpack.Decoder(DetokenizerRequest) + decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) + + with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.constants.PULL) as engine_core_outputs_socket, + zmq_socket_ctx(self.input_path, zmq.constants.PULL) as input_socket, + zmq_socket_ctx(self.output_path, zmq.constants.PUSH) as output_socket): + + # TODO: make this work without poll by having both EngineCore + # and AsyncLLM send to the same socket (unclear why this was not working + # when I originally tried it) + poller = zmq.Poller() + poller.register(engine_core_outputs_socket, zmq.POLLIN) + poller.register(input_socket, zmq.POLLIN) + + while True: + socks = dict(poller.poll()) + + # Handle NewRequest + if input_socket in socks: + (frame, ) = input_socket.recv_multipart(copy=False) + detokenizer_request = decoder_new.decode(frame.buffer) + self.add_request(detokenizer_request) + + # Handle EngineCoreOutput + if from_engine_core_socket in socks: + (frame, ) = from_engine_core_socket.recv_multipart(copy=False) + engine_core_outputs = decoder_out.decode(frame.buffer).outputs + outputs = self.step(engine_core_outputs) + msg = pickle.dumps(outputs, protocol=pickle.HIGHEST_PROTOCOL) + output_socket.send_multipart((msg, ), copy=False) + + # TODO: handle aborted due to client cancellation + # TODO: pickle -> msgpack + # TODO: send stop 
string aborts back to EngineCore directly + + +class DetokenizerClient: + + def __init__(self, *args, engine_core_outputs_path: str, **kwargs): + + # Serialization setup. + self.encoder = msgspec.msgpack.Encoder() + self.decoder = PickleEncoder() + + # ZMQ setup. + self.ctx = zmq.asyncio.Context() + + # Get input (DetokenizerRequest) to Detokenizer. + input_path = get_open_zmq_ipc_path() + self.input_socket = make_zmq_socket( + self.ctx, + input_path, + zmq.constants.PUSH, + ) + + # Get output (RequestOutput) from Detokenizer. + output_path = get_open_zmq_ipc_path() + self.output_socket = make_zmq_socket( + self.ctx, + output_path, + zmq.constants.PULL, + ) + self.output_socket.connect(output_path) + + # Start Detokenizer in background process. + self.proc_handle: Optional[BackgroundProcHandle] + self.proc_handle = DetokenizerProc.make_detokenizer_process( + *args, + engine_core_outputs_path=engine_core_outputs_path, + input_path=input_path, + output_path=output_path, + **kwargs, + ) + + async def add_request_async(self, request: DetokenizerRequest): + """Send new DetokenizerRequest to Detokenizer.""" + + msg = (DetokenizerRequestType.NEW.value, self.encoder.encode(request)) + await self.input_socket.send_multipart(msg, copy=False) + + + async def get_output_async(self) -> Tuple[List[RequestOutput], List[str]]: + """Get RequestOutputs, RequestsToAbort from Detokenizer.""" + + (frame, ) = await self.output_socket.recv_multipart(copy=False) + return self.decoder.decode(frame.buffer) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 17441dacdc5cf..aa246f778f8f3 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -23,7 +23,7 @@ get_open_zmq_ipc_path) from vllm.v1.executor.abstract import Executor from vllm.v1.outputs import ModelRunnerOutput -from vllm.v1.utils import make_zmq_socket +from vllm.v1.utils import zmq_socket_ctx from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -254,7 +254,7 @@ def __init__( worker_response_mq_handle = self.worker_response_mq.export_handle() # Send Readiness signal to EngineCore process. - with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket: + with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: payload = pickle.dumps(worker_response_mq_handle, protocol=pickle.HIGHEST_PROTOCOL) ready_socket.send_string(WorkerProc.READY_STR) @@ -356,7 +356,7 @@ def wait_for_startup( ready_path: str, ) -> Optional[Handle]: """Wait until the Worker is ready.""" - with make_zmq_socket(ready_path, zmq.constants.PULL) as socket: + with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: # Wait for Worker to send READY. 
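
The `DetokenizerClient` above talks to its background process through zmq.asyncio sockets: `add_request_async` pushes a tagged frame, and `get_output_async` awaits a multipart reply and decodes `frame.buffer`. A self-contained sketch of that send/await pattern, collapsed into one process over inproc with dummy payloads (everything named here is illustrative):

import asyncio
import zmq
import zmq.asyncio

async def main():
    ctx = zmq.asyncio.Context()
    push = ctx.socket(zmq.PUSH)
    push.bind("inproc://demo_pipe")      # PUSH binds, per this patch's convention
    pull = ctx.socket(zmq.PULL)
    pull.connect("inproc://demo_pipe")

    # Sender side: one-byte type tag plus payload, like add_request_async.
    await push.send_multipart((b"\x00", b"request-payload"), copy=False)

    # Receiver side: await the frames without copying, then read frame.buffer.
    type_frame, data_frame = await pull.recv_multipart(copy=False)
    print(bytes(type_frame.buffer), bytes(data_frame.buffer))

    ctx.destroy(linger=0)

asyncio.run(main())
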
while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 5f327d7066830..ecd98f246b064 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,3 +1,5 @@ +from multiprocessing.process import BaseProcess + from collections import OrderedDict from collections.abc import Sequence from contextlib import contextmanager @@ -5,6 +7,7 @@ overload) import zmq +import zmq.asyncio from vllm.logger import init_logger @@ -78,24 +81,33 @@ def __len__(self): return len(self._x) -@contextmanager def make_zmq_socket( + ctx: Union[zmq.asyncio.Context, zmq.Context], + path: str, + type: Any + ) -> Union[zmq.Socket, zmq.asyncio.Socket]: + """Make a ZMQ socket with the proper bind/connext semantics.""" + + socket = ctx.socket(type) + + if type == zmq.constants.PULL: + socket.connect(path) + elif type == zmq.constants.PUSH: + socket.bind(path) + else: + raise ValueError(f"Unknown Socket Type: {type}") + + return socket + +@contextmanager +def zmq_socket_ctx( path: str, type: Any) -> Iterator[zmq.Socket]: # type: ignore[name-defined] """Context manager for a ZMQ socket""" ctx = zmq.Context() # type: ignore[attr-defined] try: - socket = ctx.socket(type) - - if type == zmq.constants.PULL: - socket.connect(path) - elif type == zmq.constants.PUSH: - socket.bind(path) - else: - raise ValueError(f"Unknown Socket Type: {type}") - - yield socket + yield make_zmq_socket(ctx, path, type) except KeyboardInterrupt: logger.debug("Worker had Keyboard Interrupt.") @@ -104,6 +116,30 @@ def make_zmq_socket( ctx.destroy(linger=0) +def wait_for_startup( + proc: BaseProcess, + ready_path: str, + ready_str: str, + timeout_ms: int, +) -> None: + """Wait until a background process is ready.""" + + with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: + try: + while socket.poll(timeout=timeout_ms) == 0: + logger.debug("Waiting for background proc to startup.") + + if not proc.is_alive(): + raise RuntimeError("Background process failed to start.") + + message = socket.recv_string() + assert message == ready_str + + except BaseException as e: + logger.exception(e) + raise e + + K = TypeVar('K') V = TypeVar('V') From f0b3e36e48c121b5139e36ec871e1631bab13afd Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 03:45:15 +0000 Subject: [PATCH 002/132] finally able to generate text --- vllm/v1/engine/core.py | 6 ++---- vllm/v1/engine/detokenizer.py | 6 +++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index dcafac4ad2463..f8b49abd5b385 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -194,12 +194,10 @@ def make_engine_core_process( proc = context.Process(target=EngineCoreProc.run_engine_core, kwargs=process_kwargs) proc.start() - logger.info("WAITING FOR STARTUP") wait_for_startup(proc=proc, ready_path=ready_path, ready_str=EngineCoreProc.READY_STR, timeout_ms=POLLING_TIMEOUT_MS) - logger.info("READY") return BackgroundProcHandle(proc=proc, ready_path=ready_path, @@ -273,7 +271,6 @@ def run_busy_loop(self): outputs = self.step() # 4) Put EngineCoreOutputs into the output queue. 
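
The `EngineCoreProc` shape referenced here decouples socket IO from the model loop: a reader thread feeds `input_queue`, the busy loop consumes it and pushes results onto `output_queue`, and a writer thread drains that back to a socket. A minimal thread-and-queue skeleton of that structure, with ZMQ and the model step replaced by prints (the work and sentinel values are illustrative):

import queue
import threading

input_queue = queue.Queue()
output_queue = queue.Queue()

def process_input():
    # Stands in for process_input_socket: socket recv -> input_queue.
    for req in ["req-0", "req-1", None]:
        input_queue.put(req)

def process_output():
    # Stands in for process_output_socket: output_queue -> socket send.
    while True:
        out = output_queue.get()
        if out is None:
            break
        print("sent:", out)

threading.Thread(target=process_input, daemon=True).start()
writer = threading.Thread(target=process_output, daemon=True)
writer.start()

# Busy loop: block on new work when idle, otherwise step and emit outputs.
while True:
    req = input_queue.get(timeout=5)
    if req is None:
        output_queue.put(None)
        break
    output_queue.put(f"output for {req}")

writer.join()
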
- logger.info("putting to output queue") self.output_queue.put_nowait(outputs) self._log_stats() @@ -345,5 +342,6 @@ def process_output_socket(self, output_path: str): engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) encoder.encode_into(outputs, buffer) - msg = (DetokenizerRequestType.OUT.value, buffer) + # msg = (DetokenizerRequestType.OUT.value, buffer) + msg = (buffer, ) socket.send_multipart(msg, copy=False) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index a2f8c4a29b662..5dff221166314 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -407,8 +407,8 @@ def run_busy_loop(self): self.add_request(detokenizer_request) # Handle EngineCoreOutput - if from_engine_core_socket in socks: - (frame, ) = from_engine_core_socket.recv_multipart(copy=False) + if engine_core_outputs_socket in socks: + (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs outputs = self.step(engine_core_outputs) msg = pickle.dumps(outputs, protocol=pickle.HIGHEST_PROTOCOL) @@ -460,7 +460,7 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): async def add_request_async(self, request: DetokenizerRequest): """Send new DetokenizerRequest to Detokenizer.""" - msg = (DetokenizerRequestType.NEW.value, self.encoder.encode(request)) + msg = (self.encoder.encode(request), ) await self.input_socket.send_multipart(msg, copy=False) From ce8aa2c35cad19e56f3899d8df2627750b600cf3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 04:16:51 +0000 Subject: [PATCH 003/132] breaking under load --- vllm/v1/engine/async_llm.py | 11 +++++- vllm/v1/engine/core.py | 3 +- vllm/v1/engine/detokenizer.py | 74 ++++++++++++++++++----------------- 3 files changed, 49 insertions(+), 39 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6b158ca5f667b..d007712b6359e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -286,7 +286,9 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): for request_output in request_outputs: request_id = request_output.request_id - assert request_id in self.request_streams + logger.debug("about to assert") + if request_id not in self.request_streams: + raise ValueError("%s not found in Request Steams", request_id) # Each request in the API server pulls from the per-request stream. stream = self.request_streams.get(request_id) @@ -305,20 +307,24 @@ async def _run_output_handler(self): try: while True: # 1) Pull outputs from the Detokenizer. + logger.debug("get_output_async") request_outputs, reqs_to_abort = ( await self.detokenizer.get_output_async()) # 2) Put the RequestOutputs into the per-request AsyncStreams. + logger.debug("_process_request_outputs") self._process_request_outputs(request_outputs) # 3) Abort any requests that finished due to stop strings. + logger.debug("abort_requests_async") await self.engine_core.abort_requests_async(reqs_to_abort) # 4) Abort any requests due to client cancellations. # TODO: send back to detokenizer if this fails. 
+ logger.debug("process_cancellations") await self._process_cancellations() - except BaseException as e: + except Exception as e: logger.error(e) raise e @@ -350,6 +356,7 @@ async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, ) -> AnyTokenizer: + logger.debug("Called get_tokenizer.") return self.tokenizer.get_lora_tokenizer(lora_request) async def is_tracing_enabled(self) -> bool: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f8b49abd5b385..8af753f4f39ab 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -33,7 +33,7 @@ POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -LOGGING_TIME_S = 5000 +LOGGING_TIME_S = 5 class EngineCore: @@ -252,7 +252,6 @@ def run_busy_loop(self): if not self.scheduler.has_unfinished_requests(): while True: try: - logger.info("getting from input queue") req = self.input_queue.get(timeout=POLLING_TIMEOUT_S) self._handle_client_request(req) break diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 5dff221166314..1caa656870ddc 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -383,41 +383,45 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the Detokenizer.""" - decoder_new = msgspec.msgpack.Decoder(DetokenizerRequest) - decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) - - with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.constants.PULL) as engine_core_outputs_socket, - zmq_socket_ctx(self.input_path, zmq.constants.PULL) as input_socket, - zmq_socket_ctx(self.output_path, zmq.constants.PUSH) as output_socket): - - # TODO: make this work without poll by having both EngineCore - # and AsyncLLM send to the same socket (unclear why this was not working - # when I originally tried it) - poller = zmq.Poller() - poller.register(engine_core_outputs_socket, zmq.POLLIN) - poller.register(input_socket, zmq.POLLIN) - - while True: - socks = dict(poller.poll()) - - # Handle NewRequest - if input_socket in socks: - (frame, ) = input_socket.recv_multipart(copy=False) - detokenizer_request = decoder_new.decode(frame.buffer) - self.add_request(detokenizer_request) - - # Handle EngineCoreOutput - if engine_core_outputs_socket in socks: - (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) - engine_core_outputs = decoder_out.decode(frame.buffer).outputs - outputs = self.step(engine_core_outputs) - msg = pickle.dumps(outputs, protocol=pickle.HIGHEST_PROTOCOL) - output_socket.send_multipart((msg, ), copy=False) - - # TODO: handle aborted due to client cancellation - # TODO: pickle -> msgpack - # TODO: send stop string aborts back to EngineCore directly - + try: + # TODO: handle aborted due to client cancellation + # TODO: pickle -> msgpack + # TODO: send stop string aborts back to EngineCore directly + + decoder_new = msgspec.msgpack.Decoder(DetokenizerRequest) + decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) + + with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.constants.PULL) as engine_core_outputs_socket, + zmq_socket_ctx(self.input_path, zmq.constants.PULL) as input_socket, + zmq_socket_ctx(self.output_path, zmq.constants.PUSH) as output_socket): + + # TODO: make this work without poll by having both EngineCore + # and AsyncLLM send to the same socket (unclear why this was not working + # when I originally tried it) + poller = zmq.Poller() + poller.register(engine_core_outputs_socket, zmq.POLLIN) + poller.register(input_socket, zmq.POLLIN) + + while True: + socks = 
dict(poller.poll()) + + # Handle NewRequest + if input_socket in socks: + (frame, ) = input_socket.recv_multipart(copy=False) + detokenizer_request = decoder_new.decode(frame.buffer) + self.add_request(detokenizer_request) + + # Handle EngineCoreOutput + if engine_core_outputs_socket in socks: + (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) + engine_core_outputs = decoder_out.decode(frame.buffer).outputs + outputs = self.step(engine_core_outputs) + msg = pickle.dumps(outputs, protocol=pickle.HIGHEST_PROTOCOL) + output_socket.send_multipart((msg, ), copy=False) + + except Exception as e: + logger.error(e) + raise e class DetokenizerClient: From 457d6184b8521a43d139abe3e0a3e22d4b956dc5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 15:42:29 +0000 Subject: [PATCH 004/132] working e2e --- benchmarks/backend_request_func.py | 4 +++- vllm/v1/core/scheduler.py | 2 +- vllm/v1/engine/async_llm.py | 8 +------- vllm/v1/engine/core.py | 5 +---- vllm/v1/engine/detokenizer.py | 14 ++++++++------ 5 files changed, 14 insertions(+), 19 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index b67849038cf0d..1374768dc3def 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -242,7 +242,9 @@ async def async_request_openai_completions( "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, - "ignore_eos": request_func_input.ignore_eos, + # "ignore_eos": request_func_input.ignore_eos, + "ignore_eos": False, + } if request_func_input.extra_body: payload.update(request_func_input.extra_body) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index f76364f64033d..b44d72afae94a 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -424,7 +424,7 @@ def update_from_output( # Check for stop and update request state. # This must be called before me make the EngineCoreOutput. stopped = self._check_stop(request) - + # Add EngineCoreOutput for this Request. output = EngineCoreOutput( request_id=req_id, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d007712b6359e..f6a52213be965 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -286,9 +286,8 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): for request_output in request_outputs: request_id = request_output.request_id - logger.debug("about to assert") if request_id not in self.request_streams: - raise ValueError("%s not found in Request Steams", request_id) + raise ValueError(f"{request_id} not in AsyncStreams") # Each request in the API server pulls from the per-request stream. stream = self.request_streams.get(request_id) @@ -307,21 +306,17 @@ async def _run_output_handler(self): try: while True: # 1) Pull outputs from the Detokenizer. - logger.debug("get_output_async") request_outputs, reqs_to_abort = ( await self.detokenizer.get_output_async()) # 2) Put the RequestOutputs into the per-request AsyncStreams. - logger.debug("_process_request_outputs") self._process_request_outputs(request_outputs) # 3) Abort any requests that finished due to stop strings. - logger.debug("abort_requests_async") await self.engine_core.abort_requests_async(reqs_to_abort) # 4) Abort any requests due to client cancellations. # TODO: send back to detokenizer if this fails. 
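
`DetokenizerProc.run_busy_loop` above multiplexes two inbound pipes, new requests from the AsyncLLM and outputs from the EngineCore, with a `zmq.Poller`. A compact standalone version of that multiplexing, collapsed into one process over inproc with fake payloads (paths and messages are illustrative):

import zmq

ctx = zmq.Context()

def pipe(path):
    push = ctx.socket(zmq.PUSH)
    push.bind(path)
    pull = ctx.socket(zmq.PULL)
    pull.connect(path)
    return push, pull

new_req_push, new_req_pull = pipe("inproc://new_requests")
outputs_push, outputs_pull = pipe("inproc://engine_outputs")

poller = zmq.Poller()
poller.register(new_req_pull, zmq.POLLIN)
poller.register(outputs_pull, zmq.POLLIN)

new_req_push.send(b"new request")
outputs_push.send(b"engine core output")

handled = 0
while handled < 2:
    socks = dict(poller.poll())
    if new_req_pull in socks:
        print("add_request:", new_req_pull.recv())
        handled += 1
    if outputs_pull in socks:
        print("step on:", outputs_pull.recv())
        handled += 1

ctx.destroy(linger=0)
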
- logger.debug("process_cancellations") await self._process_cancellations() except Exception as e: @@ -356,7 +351,6 @@ async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, ) -> AnyTokenizer: - logger.debug("Called get_tokenizer.") return self.tokenizer.get_lora_tokenizer(lora_request) async def is_tracing_enabled(self) -> bool: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 8af753f4f39ab..30422891413c6 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -166,9 +166,7 @@ def __init__( daemon=True).start() # Send Readiness signal to EngineClient. - logger.info("ABOUT TO SEND READINESS") with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: - logger.info("SENDING READY SIGNAL") ready_socket.send_string(EngineCoreProc.READY_STR) @staticmethod @@ -257,7 +255,7 @@ def run_busy_loop(self): break except queue.Empty: self._log_stats() - logger.info("EngineCore busy loop waiting.") + logger.debug("EngineCore busy loop waiting.") except BaseException: raise @@ -335,7 +333,6 @@ def process_output_socket(self, output_path: str): # Reuse send buffer. buffer = bytearray() - logger.info(f"{output_path=}") with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: engine_core_outputs = self.output_queue.get() diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 1caa656870ddc..a1c384cd45967 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -277,6 +277,9 @@ def step( # Free completed requests. if request_output.finished: self.request_states.pop(request_id) + # If Request finished but EngineCore not finished, + # this was caused by a stop string + we need to send + # an abort signal to the EngineCore. if not engine_core_output.finished: requests_to_abort.append(request_id) @@ -395,9 +398,9 @@ def run_busy_loop(self): zmq_socket_ctx(self.input_path, zmq.constants.PULL) as input_socket, zmq_socket_ctx(self.output_path, zmq.constants.PUSH) as output_socket): - # TODO: make this work without poll by having both EngineCore - # and AsyncLLM send to the same socket (unclear why this was not working - # when I originally tried it) + # TODO: avoid poll by having both EngineCore + # and AsyncLLM send to the same socket (unclear why this + # was not working when I originally tried it) poller = zmq.Poller() poller.register(engine_core_outputs_socket, zmq.POLLIN) poller.register(input_socket, zmq.POLLIN) @@ -405,13 +408,13 @@ def run_busy_loop(self): while True: socks = dict(poller.poll()) - # Handle NewRequest + # Handle NewRequest. if input_socket in socks: (frame, ) = input_socket.recv_multipart(copy=False) detokenizer_request = decoder_new.decode(frame.buffer) self.add_request(detokenizer_request) - # Handle EngineCoreOutput + # Handle EngineCoreOutput. if engine_core_outputs_socket in socks: (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs @@ -449,7 +452,6 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): output_path, zmq.constants.PULL, ) - self.output_socket.connect(output_path) # Start Detokenizer in background process. 
self.proc_handle: Optional[BackgroundProcHandle] From c980dbd50574b36edbd95cafb822652db09274e0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 16:34:11 +0000 Subject: [PATCH 005/132] workign e2e --- examples/openai_completion_client.py | 8 +- vllm/entrypoints/openai/serving_completion.py | 10 +- vllm/v1/engine/async_llm.py | 198 ++++++++++-------- vllm/v1/engine/core.py | 2 +- 4 files changed, 124 insertions(+), 94 deletions(-) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index 58519f978d340..1f8b82bc5c9e9 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -2,7 +2,7 @@ # Modify OpenAI's API key and API base to use vLLM's API server. openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" +openai_api_base = "http://localhost:8001/v1" client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") @@ -14,14 +14,12 @@ model = models.data[0].id # Completion API -stream = False +stream = True completion = client.completions.create( model=model, prompt="A robot may not injure a human being", echo=False, - n=2, - stream=stream, - logprobs=3) + stream=stream) print("Completion results:") if stream: diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index bd39a4c42e938..d87c410c0124c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -159,8 +159,10 @@ async def create_completion( # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - result_generator = merge_async_iterators( - *generators, is_cancelled=raw_request.is_disconnected) + # result_generator = merge_async_iterators( + # *generators, is_cancelled=raw_request.is_disconnected) + assert len(generators) == 1 + result_generator = generators[0] model_name = self._get_model_name(lora_request) num_prompts = len(engine_prompts) @@ -256,7 +258,9 @@ async def completion_stream_generator( include_usage, include_continuous_usage = False, False try: - async for prompt_idx, res in result_generator: + # async for prompt_idx, res in result_generator: + async for res in result_generator: + prompt_idx = 0 prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b36de5f66917c..0c5eda420b102 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -54,10 +54,11 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() - # Request streams (map of request_id -> AsyncStream). - self.request_streams: Dict[str, AsyncStream] = {} - # List of cancelled request ids to be aborted. - self.client_aborted_requests: List[str] = [] + # # Request streams (map of request_id -> AsyncStream). + # self.request_streams: Dict[str, AsyncStream] = {} + # # List of cancelled request ids to be aborted. + # self.client_aborted_requests: List[str] = [] + self.rid_to_state = {} # Processor (converts Inputs --> EngineCoreRequests). 
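
The `rid_to_state` dict introduced above (formalized as a `RequestState` dataclass in a later commit) replaces the per-request AsyncStreams: the output handler appends to `out_list` and sets an `asyncio.Event`, while `generate()` waits on that event with a timeout so it can periodically check for things like client disconnects instead of blocking forever. A standalone sketch of that handshake; the one-second timeout and the dummy outputs are illustrative.

import asyncio
from dataclasses import dataclass, field

@dataclass
class RequestState:
    event: asyncio.Event = field(default_factory=asyncio.Event)
    out_list: list = field(default_factory=list)
    finished: bool = False

async def main():
    state = RequestState()

    async def output_handler():
        # Stands in for the loop that drains the Detokenizer.
        for text, finished in [("Hello", False), ("Hello world", True)]:
            await asyncio.sleep(0.1)
            state.out_list.append(text)
            state.finished = finished
            state.event.set()

    handler = asyncio.create_task(output_handler())

    # generate(): wait for new output, waking up periodically on timeout.
    while True:
        try:
            await asyncio.wait_for(state.event.wait(), timeout=1)
        except asyncio.TimeoutError:
            continue   # the real code would check for cancellation here
        out = state.out_list[-1]
        state.out_list.clear()
        state.event.clear()
        print("yield:", out)
        if state.finished:
            break

    await handler

asyncio.run(main())
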
self.processor = Processor(vllm_config.model_config, @@ -149,14 +150,18 @@ async def add_request( trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: + ) -> asyncio.Event: """Add new request to the AsyncLLM.""" if self.detokenizer.is_request_active(request_id): raise ValueError(f"Request {request_id} already exists.") - # 1) Create a new AsyncStream for the request. - stream = self._add_request_to_streams(request_id) + state = { + "out_list": [], + "event": asyncio.Event(), + "finished": False, + } + self.rid_to_state[request_id] = state # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. detokenizer_req, engine_core_req = self.processor.process_inputs( @@ -169,8 +174,7 @@ async def add_request( # 4) Add the EngineCoreRequest to EngineCore (separate process). await self.engine_core.add_request_async(engine_core_req) - # 5) Return the generator. - return stream.generator() + return state # TODO: we should support multiple prompts in one call, as you # can do with LLM.generate. So that for multi-prompt completion @@ -209,87 +213,113 @@ async def generate( self.output_handler = asyncio.create_task( self._run_output_handler()) - async for output in await self.add_request( - request_id, - prompt, - sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, - priority=priority, - ): - yield output - - def _finish_stream(self, request_id: str): - stream = self.request_streams.pop(request_id, None) - if stream is not None: - stream.finish() - - def _add_request_to_streams( - self, - request_id: str, - ) -> AsyncStream: - - if request_id in self.request_streams: - raise ValueError(f"Request id {request_id} already running.") - - # Avoid streams having circular ref to parent AsyncLLM object. - aborted_reqs = self.client_aborted_requests - stream = AsyncStream(request_id, aborted_reqs.append) - self.request_streams[request_id] = stream - - if self.log_requests: - logger.info("Added request %s.", request_id) - - return stream - - async def _process_cancellations(self) -> None: - """ - Process requests cancelled from user disconnecting. - - When a client disconnects, AsyncStream._cancel() is called. - We passed a callback to AsyncStream(), which appends to - self.client_aborted_requests. - - As a result, if any requests are canceled from the user side - the request_id will show up in self.client_aborted_requests. - """ - - # Avoid streams having circular ref to parent AsyncLLM object. - if not self.client_aborted_requests: - return - reqs_to_abort = self.client_aborted_requests.copy() - self.client_aborted_requests.clear() - - # Remove from Detokenizer. - self.detokenizer.abort_requests(reqs_to_abort) - - # Remove from RequestStreams. - for request_id in reqs_to_abort: - if self.log_requests: - logger.info("User-cancelled request %s.", request_id) - self._finish_stream(request_id) - - # Remove from EngineCore. 
- await self.engine_core.abort_requests_async(reqs_to_abort) + state = await self.add_request( + request_id, + prompt, + sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=priority, + ) + + while True: + try: + await asyncio.wait_for(state["event"].wait(), timeout=4) + out = state["out_list"][-1] + except asyncio.TimeoutError: + # if request is not None and await request.is_disconnected(): + # self.abort_request(obj.rid) + # raise ValueError(f"Abort request {obj.rid}") + continue + + state["out_list"] = [] + if state["finished"]: + del self.rid_to_state[request_id] + yield out + break + + state["event"].clear() + yield out + + # def _finish_stream(self, request_id: str): + # stream = self.request_streams.pop(request_id, None) + # if stream is not None: + # stream.finish() + + # def _add_request_to_streams( + # self, + # request_id: str, + # ) -> AsyncStream: + + # if request_id in self.request_streams: + # raise ValueError(f"Request id {request_id} already running.") + + # # Avoid streams having circular ref to parent AsyncLLM object. + # aborted_reqs = self.client_aborted_requests + # stream = AsyncStream(request_id, aborted_reqs.append) + # self.request_streams[request_id] = stream + + # if self.log_requests: + # logger.info("Added request %s.", request_id) + + # return stream + + # async def _process_cancellations(self) -> None: + # """ + # Process requests cancelled from user disconnecting. + + # When a client disconnects, AsyncStream._cancel() is called. + # We passed a callback to AsyncStream(), which appends to + # self.client_aborted_requests. + + # As a result, if any requests are canceled from the user side + # the request_id will show up in self.client_aborted_requests. + # """ + + # # Avoid streams having circular ref to parent AsyncLLM object. + # if not self.client_aborted_requests: + # return + # reqs_to_abort = self.client_aborted_requests.copy() + # self.client_aborted_requests.clear() + + # # Remove from Detokenizer. + # self.detokenizer.abort_requests(reqs_to_abort) + + # # Remove from RequestStreams. + # for request_id in reqs_to_abort: + # if self.log_requests: + # logger.info("User-cancelled request %s.", request_id) + # self._finish_stream(request_id) + + # # Remove from EngineCore. + # await self.engine_core.abort_requests_async(reqs_to_abort) def _process_request_outputs(self, request_outputs: List[RequestOutput]): """Process outputs by putting them into per-request AsyncStreams.""" for request_output in request_outputs: request_id = request_output.request_id - assert request_id in self.request_streams + assert request_id in self.rid_to_state + state = self.rid_to_state[request_id] + + if request_output.finished: + state["finished"] = True + + state["out_list"].append(request_output) + state["event"].set() - # Each request in the API server pulls from the per-request stream. - stream = self.request_streams.get(request_id) - if stream is not None: - stream.put(request_output) - # If finished, remove from the tracker. - if request_output.finished: - if self.log_requests: - logger.info("Finished request %s.", request_id) - self._finish_stream(request_id) + # # Each request in the API server pulls from the per-request stream. + # stream = self.request_streams.get(request_id) + # if stream is not None: + # stream.put(request_output) + + # # If finished, remove from the tracker. 
+ # if request_output.finished: + # if self.log_requests: + # logger.info("Finished request %s.", request_id) + # self._finish_stream(request_id) async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" @@ -309,14 +339,12 @@ async def _run_output_handler(self): await self.engine_core.abort_requests_async(reqs_to_abort) # 5) Abort any requests due to client cancellations. - await self._process_cancellations() + # await self._process_cancellations() except BaseException as e: logger.error(e) raise e - # TODO: can we eliminate these? - async def abort(self, request_id: str) -> None: # Note: Who Calls this? I dont think this is actually used. raise ValueError("Not Supported on V1 yet.") diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 56d4dc67e4a0e..470ad80ee045c 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -32,7 +32,7 @@ POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -LOGGING_TIME_S = 5000 +LOGGING_TIME_S = 5 class EngineCore: From cba2d54ae522f7ec43a1b3efb3e73bf1e4b8465b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 19:06:46 +0000 Subject: [PATCH 006/132] stash --- benchmarks/backend_request_func.py | 2 + benchmarks/benchmark_serving.py | 6 ++ examples/openai_completion_client.py | 2 +- vllm/entrypoints/openai/protocol.py | 5 +- vllm/entrypoints/openai/serving_completion.py | 68 ++++++++++--------- vllm/v1/engine/async_llm.py | 50 ++++++++------ 6 files changed, 77 insertions(+), 56 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index b67849038cf0d..7b324b9e9897a 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -40,6 +40,7 @@ class RequestFuncOutput: tpot: float = 0.0 # avg next-token latencies prompt_len: int = 0 error: str = "" + total_chunks: int = 0 async def async_request_tgi( @@ -269,6 +270,7 @@ async def async_request_openai_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") + output.total_chunks += 1 if chunk == "[DONE]": latency = time.perf_counter() - st else: diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 4eb0e1f8ac903..96dc55a078960 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -630,6 +630,12 @@ async def limited_request_func(request_func_input, pbar): pbar=pbar))) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + total_chunks = 0 + for output in outputs: + total_chunks += output.total_chunks + + print(f"TOTAL_CHUNKS: {total_chunks}") + if profile: print("Stopping profiler...") profile_input = RequestFuncInput( diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index 1f8b82bc5c9e9..d905d18f151f0 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -24,6 +24,6 @@ print("Completion results:") if stream: for c in completion: - print(c) + print(c.choices[0].text) else: print(completion) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 6ed7c2e9dcd6b..f0f04d2e512be 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -737,8 +737,9 @@ def to_sampling_params( logits_processors=get_logits_processors(self.logits_processors, logits_processor_pattern), truncate_prompt_tokens=self.truncate_prompt_tokens, - output_kind=RequestOutputKind.DELTA if self.stream \ - else 
RequestOutputKind.FINAL_ONLY, + # output_kind=RequestOutputKind.DELTA if self.stream \ + # else RequestOutputKind.FINAL_ONLY, + output_kind=RequestOutputKind.CUMULATIVE, guided_decoding=guided_decoding, logit_bias=self.logit_bias, allowed_token_ids=self.allowed_token_ids) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index d87c410c0124c..254937aab829c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -257,6 +257,8 @@ async def completion_stream_generator( else: include_usage, include_continuous_usage = False, False + streamed_text = "" + try: # async for prompt_idx, res in result_generator: async for res in result_generator: @@ -278,28 +280,29 @@ async def completion_stream_generator( assert request.max_tokens is not None if request.echo and not has_echoed[i]: - assert prompt_token_ids is not None - assert prompt_text is not None - if request.max_tokens == 0: - # only return the prompt - delta_text = prompt_text - delta_token_ids = prompt_token_ids - out_logprobs = prompt_logprobs - else: - assert prompt_logprobs is not None - # echo the prompt and first token - delta_text = prompt_text + output.text - delta_token_ids = [ - *prompt_token_ids, *output.token_ids - ] - out_logprobs = [ - *prompt_logprobs, - *(output.logprobs or []), - ] - has_echoed[i] = True + pass + # assert prompt_token_ids is not None + # assert prompt_text is not None + # if request.max_tokens == 0: + # # only return the prompt + # delta_text = prompt_text + # delta_token_ids = prompt_token_ids + # out_logprobs = prompt_logprobs + # else: + # assert prompt_logprobs is not None + # # echo the prompt and first token + # delta_text = prompt_text + output.text + # delta_token_ids = [ + # *prompt_token_ids, *output.token_ids + # ] + # out_logprobs = [ + # *prompt_logprobs, + # *(output.logprobs or []), + # ] + # has_echoed[i] = True else: # return just the delta - delta_text = output.text + delta_text = output.text[previous_text_lens[i]:] delta_token_ids = output.token_ids out_logprobs = output.logprobs @@ -309,20 +312,23 @@ async def completion_stream_generator( continue if request.logprobs is not None: - assert out_logprobs is not None, ( - "Did not output logprobs") - logprobs = self._create_completion_logprobs( - token_ids=delta_token_ids, - top_logprobs=out_logprobs, - num_output_top_logprobs=request.logprobs, - tokenizer=tokenizer, - initial_text_offset=previous_text_lens[i], - ) + pass + # assert out_logprobs is not None, ( + # "Did not output logprobs") + # logprobs = self._create_completion_logprobs( + # token_ids=delta_token_ids, + # top_logprobs=out_logprobs, + # num_output_top_logprobs=request.logprobs, + # tokenizer=tokenizer, + # initial_text_offset=previous_text_lens[i], + # ) else: logprobs = None - previous_text_lens[i] += len(output.text) - previous_num_tokens[i] += len(output.token_ids) + # previous_text_lens[i] += len(output.text) + # previous_num_tokens[i] += len(output.token_ids) + previous_text_lens[i] = len(output.text) + previous_num_tokens[i] = len(output.token_ids) finish_reason = output.finish_reason stop_reason = output.stop_reason diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 0c5eda420b102..dd2ebb06bdf2e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,4 +1,5 @@ import asyncio +from dataclasses import dataclass from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import 
ModelConfig, VllmConfig @@ -16,7 +17,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.v1.engine.async_stream import AsyncStream from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor @@ -24,6 +24,17 @@ logger = init_logger(__name__) +@dataclass +class RequestState: + + event: asyncio.Event + out_list: List[RequestOutput] + finished: bool + + @classmethod + def new(cls) -> "RequestState": + return cls(asyncio.Event(), [], False) + class AsyncLLM(EngineClient): @@ -46,6 +57,9 @@ def __init__( self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config + # RequestId -> RequestState. + self.rid_to_state: Dict[str, RequestState] = {} + # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, @@ -54,12 +68,6 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() - # # Request streams (map of request_id -> AsyncStream). - # self.request_streams: Dict[str, AsyncStream] = {} - # # List of cancelled request ids to be aborted. - # self.client_aborted_requests: List[str] = [] - self.rid_to_state = {} - # Processor (converts Inputs --> EngineCoreRequests). self.processor = Processor(vllm_config.model_config, vllm_config.lora_config, self.tokenizer, @@ -156,11 +164,9 @@ async def add_request( if self.detokenizer.is_request_active(request_id): raise ValueError(f"Request {request_id} already exists.") - state = { - "out_list": [], - "event": asyncio.Event(), - "finished": False, - } + # 1) Add to RequestState tracker. The "event" is used to manage + # concurrency between generate() and output_handler task. + state = RequestState.new() self.rid_to_state[request_id] = state # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. @@ -224,22 +230,22 @@ async def generate( ) while True: - try: - await asyncio.wait_for(state["event"].wait(), timeout=4) - out = state["out_list"][-1] + try: + await asyncio.wait_for(state.event.wait(), timeout=4) + out = state.out_list[-1] except asyncio.TimeoutError: # if request is not None and await request.is_disconnected(): # self.abort_request(obj.rid) # raise ValueError(f"Abort request {obj.rid}") continue - state["out_list"] = [] - if state["finished"]: + state.out_list = [] + if state.finished: del self.rid_to_state[request_id] yield out break - state["event"].clear() + state.event.clear() yield out # def _finish_stream(self, request_id: str): @@ -304,10 +310,10 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): state = self.rid_to_state[request_id] if request_output.finished: - state["finished"] = True - - state["out_list"].append(request_output) - state["event"].set() + state.finished = True + + state.out_list.append(request_output) + state.event.set() # # Each request in the API server pulls from the per-request stream. 
From 3ae44a8b53496ee54a3e200c612a3f8c03a366ee Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 22:40:36 +0000 Subject: [PATCH 007/132] stash --- examples/openai_completion_client.py | 9 ++- vllm/entrypoints/openai/protocol.py | 8 +- vllm/entrypoints/openai/serving_completion.py | 81 +++++++++---------- vllm/v1/engine/async_llm.py | 6 +- 4 files changed, 57 insertions(+), 47 deletions(-) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index d905d18f151f0..bbc6f27ece2fe 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -18,12 +18,17 @@ completion = client.completions.create( model=model, prompt="A robot may not injure a human being", - echo=False, + echo=True, + n=1, + logprobs=2, stream=stream) print("Completion results:") +text = "" if stream: for c in completion: - print(c.choices[0].text) + text += c.choices[0].text + print(c) + print(text) else: print(completion) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index f0f04d2e512be..3ed78edd8f84e 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -9,6 +9,7 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator from typing_extensions import Annotated +from vllm.envs import VLLM_USE_V1 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam from vllm.logger import init_logger from vllm.pooling_params import PoolingParams @@ -38,6 +39,11 @@ assert _LONG_INFO.max == _MOCK_LONG_INFO.max +STREAM_SAMPLING_OUTPUT_KIND = RequestOutputKind.DELTA +if VLLM_USE_V1: + STREAM_SAMPLING_OUTPUT_KIND = RequestOutputKind.CUMULATIVE + + class OpenAIBaseModel(BaseModel): # OpenAI API does allow extra fields model_config = ConfigDict(extra="allow") @@ -422,7 +428,7 @@ def to_sampling_params( logits_processor_pattern), include_stop_str_in_output=self.include_stop_str_in_output, truncate_prompt_tokens=self.truncate_prompt_tokens, - output_kind=RequestOutputKind.DELTA if self.stream \ + output_kind=STREAM_SAMPLING_OUTPUT_KIND if self.stream \ else RequestOutputKind.FINAL_ONLY, guided_decoding=guided_decoding, logit_bias=self.logit_bias) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 254937aab829c..c7416cd6ac492 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -159,10 +159,8 @@ async def create_completion( # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - # result_generator = merge_async_iterators( - # *generators, is_cancelled=raw_request.is_disconnected) - assert len(generators) == 1 - result_generator = generators[0] + result_generator = merge_async_iterators( + *generators, is_cancelled=raw_request.is_disconnected) model_name = self._get_model_name(lora_request) num_prompts = len(engine_prompts) @@ -257,11 +255,10 @@ async def completion_stream_generator( else: include_usage, include_continuous_usage = False, False - streamed_text = "" - try: - # async for prompt_idx, res in result_generator: - async for res in result_generator: + async for prompt_idx, res in result_generator: + # Output.text is cumulative + prompt_idx = 0 prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs @@ -280,31 +277,32 @@ async def completion_stream_generator( assert request.max_tokens is not None if request.echo and not has_echoed[i]: - pass - # assert prompt_token_ids is 
not None - # assert prompt_text is not None - # if request.max_tokens == 0: - # # only return the prompt - # delta_text = prompt_text - # delta_token_ids = prompt_token_ids - # out_logprobs = prompt_logprobs - # else: - # assert prompt_logprobs is not None - # # echo the prompt and first token - # delta_text = prompt_text + output.text - # delta_token_ids = [ - # *prompt_token_ids, *output.token_ids - # ] - # out_logprobs = [ - # *prompt_logprobs, - # *(output.logprobs or []), - # ] - # has_echoed[i] = True + assert prompt_token_ids is not None + assert prompt_text is not None + # If we not echoed, we have not sent text yet. + assert previous_text_lens[i] == 0 + assert previous_num_tokens[i] == 0 + if request.max_tokens == 0: + # only return the prompt + delta_text = prompt_text + delta_token_ids = prompt_token_ids + out_logprobs = prompt_logprobs + else: + assert prompt_logprobs is not None + # echo the prompt and first token + delta_text = prompt_text + output.text + delta_token_ids = [ + *prompt_token_ids, *output.token_ids + ] + out_logprobs = [ + *prompt_logprobs, + *(output.logprobs or []), + ] + has_echoed[i] = True else: - # return just the delta delta_text = output.text[previous_text_lens[i]:] - delta_token_ids = output.token_ids - out_logprobs = output.logprobs + delta_token_ids = output.token_ids[previous_num_tokens[i]:] + out_logprobs = output.logprobs[previous_num_tokens[i]:] if output.logprobs else None if not delta_text and not delta_token_ids \ and not previous_num_tokens[i]: @@ -312,21 +310,18 @@ async def completion_stream_generator( continue if request.logprobs is not None: - pass - # assert out_logprobs is not None, ( - # "Did not output logprobs") - # logprobs = self._create_completion_logprobs( - # token_ids=delta_token_ids, - # top_logprobs=out_logprobs, - # num_output_top_logprobs=request.logprobs, - # tokenizer=tokenizer, - # initial_text_offset=previous_text_lens[i], - # ) + assert out_logprobs is not None, ( + "Did not output logprobs") + logprobs = self._create_completion_logprobs( + token_ids=delta_token_ids, + top_logprobs=out_logprobs, + num_output_top_logprobs=request.logprobs, + tokenizer=tokenizer, + initial_text_offset=previous_text_lens[i], + ) else: logprobs = None - # previous_text_lens[i] += len(output.text) - # previous_num_tokens[i] += len(output.token_ids) previous_text_lens[i] = len(output.text) previous_num_tokens[i] = len(output.token_ids) finish_reason = output.finish_reason diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index dd2ebb06bdf2e..d78ac92127ef0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -24,6 +24,8 @@ logger = init_logger(__name__) +WAITING_TIMEOUT_MS=5 + @dataclass class RequestState: @@ -231,9 +233,11 @@ async def generate( while True: try: - await asyncio.wait_for(state.event.wait(), timeout=4) + await asyncio.wait_for(state.event.wait(), timeout=WAITING_TIMEOUT_MS) out = state.out_list[-1] + except asyncio.TimeoutError: + logger.debug("Timeout waiting for %s", request_id) # if request is not None and await request.is_disconnected(): # self.abort_request(obj.rid) # raise ValueError(f"Abort request {obj.rid}") From 3ef56872f00324fdf5d01c9786f29f874690a25d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 22:43:06 +0000 Subject: [PATCH 008/132] remove async stream --- vllm/v1/engine/async_stream.py | 55 ---------------------------------- 1 file changed, 55 deletions(-) delete mode 100644 vllm/v1/engine/async_stream.py diff --git 
a/vllm/v1/engine/async_stream.py b/vllm/v1/engine/async_stream.py deleted file mode 100644 index 35449238c3259..0000000000000 --- a/vllm/v1/engine/async_stream.py +++ /dev/null @@ -1,55 +0,0 @@ -import asyncio -from typing import Any, AsyncGenerator, Callable, Optional, Type, Union - -from vllm.outputs import PoolingRequestOutput, RequestOutput - - -class AsyncStream: - """A stream of RequestOutputs or PoolingRequestOutputs for a request - that can be iterated over asynchronously via an async generator.""" - - STOP_ITERATION = Exception() # Sentinel - - def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: - self.request_id = request_id - self._cancel = cancel - self._queue: asyncio.Queue = asyncio.Queue() - self._finished = False - - def put(self, item: Union[RequestOutput, PoolingRequestOutput, - Exception]) -> None: - if not self._finished: - self._queue.put_nowait(item) - - def finish( - self, - exception: Optional[Union[BaseException, Type[BaseException]]] = None, - ) -> None: - if not self._finished: - self._finished = True - self._queue.put_nowait(exception if self._is_raisable(exception) - else AsyncStream.STOP_ITERATION) - - async def generator( - self - ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: - finished = False - try: - while True: - result = await self._queue.get() - if self._is_raisable(result): - finished = True - if result == AsyncStream.STOP_ITERATION: - return - raise result - yield result - finally: - self._finished = True - if not finished: - self._cancel(self.request_id) - - @staticmethod - def _is_raisable(value: Any): - return isinstance(value, BaseException) or \ - (isinstance(value, type) and \ - issubclass(value, BaseException)) From b350084e509276081331597ab9c8c36e3465aec9 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 22:54:51 +0000 Subject: [PATCH 009/132] fix protocol --- vllm/entrypoints/openai/protocol.py | 5 +- vllm/v1/engine/async_llm.py | 96 ++++++++++++----------------- 2 files changed, 42 insertions(+), 59 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 3ed78edd8f84e..e3c6a33ef430d 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -743,9 +743,8 @@ def to_sampling_params( logits_processors=get_logits_processors(self.logits_processors, logits_processor_pattern), truncate_prompt_tokens=self.truncate_prompt_tokens, - # output_kind=RequestOutputKind.DELTA if self.stream \ - # else RequestOutputKind.FINAL_ONLY, - output_kind=RequestOutputKind.CUMULATIVE, + output_kind=STREAM_SAMPLING_OUTPUT_KIND if self.stream \ + else RequestOutputKind.FINAL_ONLY, guided_decoding=guided_decoding, logit_bias=self.logit_bias, allowed_token_ids=self.allowed_token_ids) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d78ac92127ef0..2668190a46d03 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -201,17 +201,17 @@ async def generate( ) -> AsyncGenerator[RequestOutput, None]: """ Main function called by the API server to kick off a request - * 1) Making an AsyncStream corresponding to the Request. + * 1) Making an RequestState corresponding to the Request. # 2) Processing the Input. * 3) Adding the Request to the Detokenizer. * 4) Adding the Request to the EngineCore (separate process). - A separate output_handler loop runs in a background AsyncIO task, - pulling outputs from EngineCore and putting them into the - per-request AsyncStream. 
+ A separate output_handler loop runs in a background task, + pulling outputs from EngineCore and updating the RequestState and + setting the asyncio Event. - The caller of generate() iterates the returned AsyncGenerator, - returning the RequestOutput back to the caller. + The caller of generate() waits on the asyncio event and forwards + the latest RequestOutput back to the caller. """ # We start the output_handler on the first call to generate() so that @@ -232,15 +232,22 @@ async def generate( ) while True: - try: + try: await asyncio.wait_for(state.event.wait(), timeout=WAITING_TIMEOUT_MS) + + # NOTE(rob): out_list can have more than one item. However, in the + # streaming case, we use RequestOutputKind.CUMULATIVE, which has the + # full generated text output (not just the text corresponding to the + # last token). So, we can just send the last item and the API Client + # handles converting the stream buffer into a delta text. This way + # we do "dynamic chunked streaming", such that the API client does not + # fall behind the EngineCore (which happens at high QPS othwerwise). out = state.out_list[-1] except asyncio.TimeoutError: logger.debug("Timeout waiting for %s", request_id) - # if request is not None and await request.is_disconnected(): - # self.abort_request(obj.rid) - # raise ValueError(f"Abort request {obj.rid}") + + # TODO (rob): do request cancellation checking here. continue state.out_list = [] @@ -252,58 +259,35 @@ async def generate( state.event.clear() yield out - # def _finish_stream(self, request_id: str): - # stream = self.request_streams.pop(request_id, None) - # if stream is not None: - # stream.finish() - - # def _add_request_to_streams( - # self, - # request_id: str, - # ) -> AsyncStream: - - # if request_id in self.request_streams: - # raise ValueError(f"Request id {request_id} already running.") - - # # Avoid streams having circular ref to parent AsyncLLM object. - # aborted_reqs = self.client_aborted_requests - # stream = AsyncStream(request_id, aborted_reqs.append) - # self.request_streams[request_id] = stream - - # if self.log_requests: - # logger.info("Added request %s.", request_id) - - # return stream - - # async def _process_cancellations(self) -> None: - # """ - # Process requests cancelled from user disconnecting. + async def _process_cancellations(self) -> None: + """ + Process requests cancelled from user disconnecting. - # When a client disconnects, AsyncStream._cancel() is called. - # We passed a callback to AsyncStream(), which appends to - # self.client_aborted_requests. + When a client disconnects, AsyncStream._cancel() is called. + We passed a callback to AsyncStream(), which appends to + self.client_aborted_requests. - # As a result, if any requests are canceled from the user side - # the request_id will show up in self.client_aborted_requests. - # """ + As a result, if any requests are canceled from the user side + the request_id will show up in self.client_aborted_requests. + """ - # # Avoid streams having circular ref to parent AsyncLLM object. - # if not self.client_aborted_requests: - # return - # reqs_to_abort = self.client_aborted_requests.copy() - # self.client_aborted_requests.clear() + # Avoid streams having circular ref to parent AsyncLLM object. + if not self.client_aborted_requests: + return + reqs_to_abort = self.client_aborted_requests.copy() + self.client_aborted_requests.clear() - # # Remove from Detokenizer. - # self.detokenizer.abort_requests(reqs_to_abort) + # Remove from Detokenizer. 
+ self.detokenizer.abort_requests(reqs_to_abort) - # # Remove from RequestStreams. - # for request_id in reqs_to_abort: - # if self.log_requests: - # logger.info("User-cancelled request %s.", request_id) - # self._finish_stream(request_id) + # Remove from RequestStreams. + for request_id in reqs_to_abort: + if self.log_requests: + logger.info("User-cancelled request %s.", request_id) + self._finish_stream(request_id) - # # Remove from EngineCore. - # await self.engine_core.abort_requests_async(reqs_to_abort) + # Remove from EngineCore. + await self.engine_core.abort_requests_async(reqs_to_abort) def _process_request_outputs(self, request_outputs: List[RequestOutput]): """Process outputs by putting them into per-request AsyncStreams.""" From abd7fa39adcab529f6f6cfa668916a831075d2f2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 22:55:35 +0000 Subject: [PATCH 010/132] clean up completion client --- vllm/entrypoints/openai/serving_completion.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index c7416cd6ac492..1f0a5db439580 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -257,9 +257,6 @@ async def completion_stream_generator( try: async for prompt_idx, res in result_generator: - # Output.text is cumulative - - prompt_idx = 0 prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt From 6986457965b559534cbc2d3d9879113e2ea31ecd Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:08:18 +0000 Subject: [PATCH 011/132] stash --- vllm/v1/engine/async_llm.py | 39 ++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 2668190a46d03..c49a55e9d1139 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -13,7 +13,7 @@ from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import SamplingParams +from vllm.sampling_params import SamplingParams, RequestOutputKind from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext @@ -28,6 +28,10 @@ @dataclass class RequestState: + """RequestState manages concurrency between the output_handler, + which pulls outputs from EngineCore and the user-facing generate() + function the + """ event: asyncio.Event out_list: List[RequestOutput] @@ -59,9 +63,6 @@ def __init__( self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config - # RequestId -> RequestState. - self.rid_to_state: Dict[str, RequestState] = {} - # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, @@ -70,6 +71,11 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() + # RequestId -> RequestState. + self.rid_to_state: Dict[str, RequestState] = {} + # List of cancelled request ids to be aborted. + self.client_aborted_requests: List[str] = [] + # Processor (converts Inputs --> EngineCoreRequests). 
self.processor = Processor(vllm_config.model_config, vllm_config.lora_config, self.tokenizer, @@ -160,16 +166,15 @@ async def add_request( trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> asyncio.Event: + ) -> RequestState: """Add new request to the AsyncLLM.""" if self.detokenizer.is_request_active(request_id): raise ValueError(f"Request {request_id} already exists.") # 1) Add to RequestState tracker. The "event" is used to manage - # concurrency between generate() and output_handler task. - state = RequestState.new() - self.rid_to_state[request_id] = state + # concurrency between generate() and output_handler(). + self.rid_to_state[request_id] = RequestState.new() # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. detokenizer_req, engine_core_req = self.processor.process_inputs( @@ -182,7 +187,7 @@ async def add_request( # 4) Add the EngineCoreRequest to EngineCore (separate process). await self.engine_core.add_request_async(engine_core_req) - return state + return self.rid_to_state[request_id] # TODO: we should support multiple prompts in one call, as you # can do with LLM.generate. So that for multi-prompt completion @@ -201,19 +206,22 @@ async def generate( ) -> AsyncGenerator[RequestOutput, None]: """ Main function called by the API server to kick off a request - * 1) Making an RequestState corresponding to the Request. + * 1) Make RequestState corresponding to the Request. # 2) Processing the Input. * 3) Adding the Request to the Detokenizer. * 4) Adding the Request to the EngineCore (separate process). - A separate output_handler loop runs in a background task, - pulling outputs from EngineCore and updating the RequestState and - setting the asyncio Event. + The output_handler() loop runs in a background task, pulling outputs from + EngineCore and updating the RequestState and setting the asyncio event. The caller of generate() waits on the asyncio event and forwards - the latest RequestOutput back to the caller. + the latest RequestOutput back to the caller. """ + # DELTA streaming is not supported due to dynamic chunking. + assert (sampling_params.output_kind == RequestOutputKind.CUMULATIVE or + sampling_params.output_kind == RequestOutputKind.FINAL_ONLY) + # We start the output_handler on the first call to generate() so that # we can call __init__ before the event loop starts, which enables us # to handle startup failure gracefully in the OpenAI server. @@ -245,9 +253,8 @@ async def generate( out = state.out_list[-1] except asyncio.TimeoutError: + # TODO(rob): do request cancellation checking here. logger.debug("Timeout waiting for %s", request_id) - - # TODO (rob): do request cancellation checking here. continue state.out_list = [] From 816e9658a7122cee6eab3e43e5cc45fdf2250e03 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:09:03 +0000 Subject: [PATCH 012/132] updated --- vllm/v1/engine/async_llm.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index c49a55e9d1139..3b755935909e4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -310,18 +310,6 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): state.out_list.append(request_output) state.event.set() - - # # Each request in the API server pulls from the per-request stream. 
- # stream = self.request_streams.get(request_id) - # if stream is not None: - # stream.put(request_output) - - # # If finished, remove from the tracker. - # if request_output.finished: - # if self.log_requests: - # logger.info("Finished request %s.", request_id) - # self._finish_stream(request_id) - async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" From cebf2870b517a10430e9b2faeb28c9516131291d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:11:35 +0000 Subject: [PATCH 013/132] updated comment --- vllm/v1/engine/async_llm.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 3b755935909e4..c1e6b8940a9c4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -300,16 +300,17 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): """Process outputs by putting them into per-request AsyncStreams.""" for request_output in request_outputs: - request_id = request_output.request_id - assert request_id in self.rid_to_state - state = self.rid_to_state[request_id] - + assert request_output.request_id in self.rid_to_state + + # Update the RequestState and alert generate() that there + # is a RequestOutput ready to return to the user. + state = self.rid_to_state[request_output.request_id] if request_output.finished: state.finished = True - state.out_list.append(request_output) state.event.set() + async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" From adcc3d291fa22e81f838bbafc52516a2d9b7c5a9 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:28:35 +0000 Subject: [PATCH 014/132] remove comptibility --- vllm/entrypoints/openai/serving_completion.py | 56 +++++++++++++++---- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 1f0a5db439580..99878be97acaf 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -27,7 +27,8 @@ PromptAdapterPath) from vllm.logger import init_logger from vllm.outputs import RequestOutput -from vllm.sampling_params import BeamSearchParams, SamplingParams +from vllm.sampling_params import (BeamSearchParams, SamplingParams, + RequestOutputKind) from vllm.sequence import Logprob from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import merge_async_iterators @@ -182,7 +183,9 @@ async def create_completion( model_name, num_prompts=num_prompts, tokenizer=tokenizer, - request_metadata=request_metadata) + request_metadata=request_metadata, + output_kind=sampling_params.output_kind, + ) # Non-streaming response final_res_batch: List[Optional[RequestOutput]] = [None] * num_prompts @@ -240,7 +243,32 @@ async def completion_stream_generator( num_prompts: int, tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, + output_kind: RequestOutputKind, ) -> AsyncGenerator[str, None]: + """ + In V0, we use RequestOutputType.DELTA and each RequestOutput + from the result_generator is guarenteed to correspond to + a single token. + + To handle this, we can simply constuct the Streaming + + In V1, we use RequestOutputType.CUMULATIVE and each RequestOutput + from the result_genrator is not guarenteed to correspond to + a single token (it could correspond to 2+ tokens). 
+ + To handle this, we need to maintain state around how many + characters and tokens have been returned so far, and dynamically + stream back just the delta (where the delta could be the text + corresponding to N tokens). + + We do this to dynamically adjust how much work the API server + is doing. If the QPS is high and streaming becomes a bottleneck, + such that the API server falls behind, we dynamically fall back + to streaming chunks of tokens. + """ + assert (output_kind == RequestOutputKind.CUMULATIVE or + output_kind == RequestOutputKind.DELTA) + num_choices = 1 if request.n is None else request.n previous_text_lens = [0] * num_choices * num_prompts previous_num_tokens = [0] * num_choices * num_prompts @@ -276,9 +304,6 @@ async def completion_stream_generator( if request.echo and not has_echoed[i]: assert prompt_token_ids is not None assert prompt_text is not None - # If we not echoed, we have not sent text yet. - assert previous_text_lens[i] == 0 - assert previous_num_tokens[i] == 0 if request.max_tokens == 0: # only return the prompt delta_text = prompt_text @@ -297,9 +322,15 @@ async def completion_stream_generator( ] has_echoed[i] = True else: - delta_text = output.text[previous_text_lens[i]:] - delta_token_ids = output.token_ids[previous_num_tokens[i]:] - out_logprobs = output.logprobs[previous_num_tokens[i]:] if output.logprobs else None + if output_kind == RequestOutputKind.CUMULATIVE: + delta_text = output.text[previous_text_lens[i]:] + delta_token_ids = output.token_ids[previous_num_tokens[i]:] + out_logprobs = (output.logprobs[previous_num_tokens[i]:] if + output.logprobs else None) + else: + delta_text = output.text + delta_token_ids = output.token_ids + out_logprobs = output.logprobs if not delta_text and not delta_token_ids \ and not previous_num_tokens[i]: @@ -319,8 +350,13 @@ async def completion_stream_generator( else: logprobs = None - previous_text_lens[i] = len(output.text) - previous_num_tokens[i] = len(output.token_ids) + if output_kind == RequestOutputKind.CUMULATIVE: + previous_text_lens[i] = len(output.text) + previous_num_tokens[i] = len(output.token_ids) + else: + previous_text_lens[i] += len(output.text) + previous_num_tokens[i] += len(output.token_ids) + finish_reason = output.finish_reason stop_reason = output.stop_reason From 4344f1bb986abc31f7ba0a8b292878d62b9c1f30 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:34:07 +0000 Subject: [PATCH 015/132] format --- benchmarks/benchmark_serving.py | 4 +- vllm/entrypoints/openai/protocol.py | 3 +- vllm/entrypoints/openai/serving_completion.py | 25 ++++++------ vllm/v1/engine/async_llm.py | 38 ++++++++++--------- 4 files changed, 36 insertions(+), 34 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 96dc55a078960..e9a9ef366004a 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -630,12 +630,12 @@ async def limited_request_func(request_func_input, pbar): pbar=pbar))) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) - total_chunks = 0 + total_chunks = 0 for output in outputs: total_chunks += output.total_chunks print(f"TOTAL_CHUNKS: {total_chunks}") - + if profile: print("Stopping profiler...") profile_input = RequestFuncInput( diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index e3c6a33ef430d..cfc02013dd8c5 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -9,8 +9,8 @@ from pydantic import 
BaseModel, ConfigDict, Field, model_validator from typing_extensions import Annotated -from vllm.envs import VLLM_USE_V1 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam +from vllm.envs import VLLM_USE_V1 from vllm.logger import init_logger from vllm.pooling_params import PoolingParams from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, @@ -38,7 +38,6 @@ assert _LONG_INFO.min == _MOCK_LONG_INFO.min assert _LONG_INFO.max == _MOCK_LONG_INFO.max - STREAM_SAMPLING_OUTPUT_KIND = RequestOutputKind.DELTA if VLLM_USE_V1: STREAM_SAMPLING_OUTPUT_KIND = RequestOutputKind.CUMULATIVE diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 99878be97acaf..f8f00d53bd30f 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -27,8 +27,8 @@ PromptAdapterPath) from vllm.logger import init_logger from vllm.outputs import RequestOutput -from vllm.sampling_params import (BeamSearchParams, SamplingParams, - RequestOutputKind) +from vllm.sampling_params import (BeamSearchParams, RequestOutputKind, + SamplingParams) from vllm.sequence import Logprob from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import merge_async_iterators @@ -172,6 +172,7 @@ async def create_completion( stream = (request.stream and (request.best_of is None or request.n == request.best_of) and not request.use_beam_search) + assert isinstance(sampling_params, SamplingParams) # Streaming response if stream: @@ -247,13 +248,11 @@ async def completion_stream_generator( ) -> AsyncGenerator[str, None]: """ In V0, we use RequestOutputType.DELTA and each RequestOutput - from the result_generator is guarenteed to correspond to - a single token. - - To handle this, we can simply constuct the Streaming + from the result_generator is guaranteed to correspond to + a single token so In V1, we use RequestOutputType.CUMULATIVE and each RequestOutput - from the result_genrator is not guarenteed to correspond to + from the result_generator is not guaranteed to correspond to a single token (it could correspond to 2+ tokens). To handle this, we need to maintain state around how many @@ -266,8 +265,8 @@ async def completion_stream_generator( such that the API server falls behind, we dynamically fall back to streaming chunks of tokens. 
""" - assert (output_kind == RequestOutputKind.CUMULATIVE or - output_kind == RequestOutputKind.DELTA) + assert (output_kind == RequestOutputKind.CUMULATIVE + or output_kind == RequestOutputKind.DELTA) num_choices = 1 if request.n is None else request.n previous_text_lens = [0] * num_choices * num_prompts @@ -324,9 +323,11 @@ async def completion_stream_generator( else: if output_kind == RequestOutputKind.CUMULATIVE: delta_text = output.text[previous_text_lens[i]:] - delta_token_ids = output.token_ids[previous_num_tokens[i]:] - out_logprobs = (output.logprobs[previous_num_tokens[i]:] if - output.logprobs else None) + delta_token_ids = output.token_ids[ + previous_num_tokens[i]:] + out_logprobs = ( + output.logprobs[previous_num_tokens[i]:] + if output.logprobs else None) else: delta_text = output.text delta_token_ids = output.token_ids diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index c1e6b8940a9c4..434378cc4ada0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -10,10 +10,10 @@ from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.outputs import PoolingRequestOutput, RequestOutput +from vllm.outputs import RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import SamplingParams, RequestOutputKind +from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext @@ -24,7 +24,8 @@ logger = init_logger(__name__) -WAITING_TIMEOUT_MS=5 +WAITING_TIMEOUT_MS = 5 + @dataclass class RequestState: @@ -211,7 +212,7 @@ async def generate( * 3) Adding the Request to the Detokenizer. * 4) Adding the Request to the EngineCore (separate process). - The output_handler() loop runs in a background task, pulling outputs from + The output_handler() loop runs in a background task, pulling from EngineCore and updating the RequestState and setting the asyncio event. The caller of generate() waits on the asyncio event and forwards @@ -219,8 +220,8 @@ async def generate( """ # DELTA streaming is not supported due to dynamic chunking. - assert (sampling_params.output_kind == RequestOutputKind.CUMULATIVE or - sampling_params.output_kind == RequestOutputKind.FINAL_ONLY) + assert (sampling_params.output_kind == RequestOutputKind.CUMULATIVE + or sampling_params.output_kind == RequestOutputKind.FINAL_ONLY) # We start the output_handler on the first call to generate() so that # we can call __init__ before the event loop starts, which enables us @@ -238,18 +239,20 @@ async def generate( prompt_adapter_request=prompt_adapter_request, priority=priority, ) - + while True: try: - await asyncio.wait_for(state.event.wait(), timeout=WAITING_TIMEOUT_MS) - - # NOTE(rob): out_list can have more than one item. However, in the - # streaming case, we use RequestOutputKind.CUMULATIVE, which has the - # full generated text output (not just the text corresponding to the - # last token). So, we can just send the last item and the API Client - # handles converting the stream buffer into a delta text. This way - # we do "dynamic chunked streaming", such that the API client does not - # fall behind the EngineCore (which happens at high QPS othwerwise). 
+ await asyncio.wait_for(state.event.wait(), + timeout=WAITING_TIMEOUT_MS) + + # NOTE(rob): out_list can have more than one item. However, + # in the streaming case, we use RequestOutputKind.CUMULATIVE, + # which has the full generated text output (not just the text + # corresponding to the last token). So, we can just send the + # last RequestOutput and the API Client handles converting into + # a delta text. This way we do "dynamic chunked streaming", such + # that the API client does not fall behind the EngineCor, + # which happens at high QPS otherwise. out = state.out_list[-1] except asyncio.TimeoutError: @@ -301,7 +304,7 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): for request_output in request_outputs: assert request_output.request_id in self.rid_to_state - + # Update the RequestState and alert generate() that there # is a RequestOutput ready to return to the user. state = self.rid_to_state[request_output.request_id] @@ -310,7 +313,6 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): state.out_list.append(request_output) state.event.set() - async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" From d7b42a02125d9a2efdf3053d6376d4accfb90c58 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:38:18 +0000 Subject: [PATCH 016/132] format/comments --- vllm/v1/engine/async_llm.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 434378cc4ada0..e03d502c258b8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -29,18 +29,24 @@ @dataclass class RequestState: - """RequestState manages concurrency between the output_handler, - which pulls outputs from EngineCore and the user-facing generate() - function the + """ + RequestState manages concurrency between: + * the output_handler(), which pulls outputs from EngineCore + * the per-request generate(), which is the interface to client code. + + The output_handler adds new RequestOutputs to out_list and sets the + asyncio event, notifying the generate() that there is work to do. + + generate() waits on the asyncio event and yields the data from + out_list back to the caller generate() """ event: asyncio.Event out_list: List[RequestOutput] - finished: bool @classmethod def new(cls) -> "RequestState": - return cls(asyncio.Event(), [], False) + return cls(asyncio.Event(), []) class AsyncLLM(EngineClient): @@ -261,7 +267,7 @@ async def generate( continue state.out_list = [] - if state.finished: + if out.finished: del self.rid_to_state[request_id] yield out break @@ -308,8 +314,6 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): # Update the RequestState and alert generate() that there # is a RequestOutput ready to return to the user. 
state = self.rid_to_state[request_output.request_id] - if request_output.finished: - state.finished = True state.out_list.append(request_output) state.event.set() From c987a763e53917064e6fa3a3ff08fb00fa2d0e66 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:39:05 +0000 Subject: [PATCH 017/132] update comment --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index e03d502c258b8..1cac46bc7d418 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -32,7 +32,7 @@ class RequestState: """ RequestState manages concurrency between: * the output_handler(), which pulls outputs from EngineCore - * the per-request generate(), which is the interface to client code. + * the per-request generate(), which yields to the API server The output_handler adds new RequestOutputs to out_list and sets the asyncio event, notifying the generate() that there is work to do. From f3ff0e070507eba56f8d67dd40b59e2a4edfc550 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:40:01 +0000 Subject: [PATCH 018/132] format --- vllm/entrypoints/openai/serving_completion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index f8f00d53bd30f..7a40c312f49ce 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -249,7 +249,8 @@ async def completion_stream_generator( """ In V0, we use RequestOutputType.DELTA and each RequestOutput from the result_generator is guaranteed to correspond to - a single token so + a single token so can construct the outputs without needing + to maintain any state. In V1, we use RequestOutputType.CUMULATIVE and each RequestOutput from the result_generator is not guaranteed to correspond to From fbf647f5308451539f84fcc86c77d889d5ac4985 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:41:20 +0000 Subject: [PATCH 019/132] updated examples --- examples/openai_completion_client.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index bbc6f27ece2fe..205755da1c34f 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -2,33 +2,29 @@ # Modify OpenAI's API key and API base to use vLLM's API server. 
openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8001/v1" +openai_api_base = "http://localhost:8000/v1" client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") api_key=openai_api_key, base_url=openai_api_base, ) - models = client.models.list() model = models.data[0].id # Completion API -stream = True +stream = False completion = client.completions.create( model=model, prompt="A robot may not injure a human being", - echo=True, - n=1, - logprobs=2, - stream=stream) + echo=False, + n=2, + stream=stream, + logprobs=3) print("Completion results:") -text = "" if stream: for c in completion: - text += c.choices[0].text print(c) - print(text) else: print(completion) From b1105b926e648c4646fe843951d07353b25bc14e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:42:15 +0000 Subject: [PATCH 020/132] more cleaning --- benchmarks/backend_request_func.py | 2 -- benchmarks/benchmark_serving.py | 6 ------ 2 files changed, 8 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 7b324b9e9897a..b67849038cf0d 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -40,7 +40,6 @@ class RequestFuncOutput: tpot: float = 0.0 # avg next-token latencies prompt_len: int = 0 error: str = "" - total_chunks: int = 0 async def async_request_tgi( @@ -270,7 +269,6 @@ async def async_request_openai_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - output.total_chunks += 1 if chunk == "[DONE]": latency = time.perf_counter() - st else: diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index e9a9ef366004a..4eb0e1f8ac903 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -630,12 +630,6 @@ async def limited_request_func(request_func_input, pbar): pbar=pbar))) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) - total_chunks = 0 - for output in outputs: - total_chunks += output.total_chunks - - print(f"TOTAL_CHUNKS: {total_chunks}") - if profile: print("Stopping profiler...") profile_input = RequestFuncInput( From ea7289bb43c6e5db534bc8c3a31ecd451cd080c5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 16 Dec 2024 23:42:36 +0000 Subject: [PATCH 021/132] make pr smaller --- examples/openai_completion_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index 205755da1c34f..58519f978d340 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -9,6 +9,7 @@ api_key=openai_api_key, base_url=openai_api_base, ) + models = client.models.list() model = models.data[0].id From 06dcb1b8be638e1e03df355ae775cf938a2efe4e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 17 Dec 2024 02:19:44 +0000 Subject: [PATCH 022/132] updated --- examples/openai_completion_client.py | 2 +- vllm/v1/engine/core.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index 58519f978d340..2c8525fd392e1 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -14,7 +14,7 @@ model = models.data[0].id # Completion API -stream = False +stream = True completion = client.completions.create( model=model, prompt="A robot may not injure a human being", diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 470ad80ee045c..9e8b47d39463f 
100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -32,7 +32,7 @@ POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -LOGGING_TIME_S = 5 +LOGGING_TIME_S = 1 class EngineCore: From 9628575d6fea679ed6b023d44d582ce0db330532 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 17 Dec 2024 02:21:10 +0000 Subject: [PATCH 023/132] added log --- vllm/v1/engine/async_llm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 1cac46bc7d418..a05cf775abaac 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -259,6 +259,8 @@ async def generate( # a delta text. This way we do "dynamic chunked streaming", such # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. + if len(state.out_list) > 1: + logger.info(f"{len(state.out_list)=}") out = state.out_list[-1] except asyncio.TimeoutError: From 5d824dff040492c7bf3052b269cedfd56c33893e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 17 Dec 2024 03:03:19 +0000 Subject: [PATCH 024/132] remove log --- vllm/v1/engine/async_llm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a05cf775abaac..1cac46bc7d418 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -259,8 +259,6 @@ async def generate( # a delta text. This way we do "dynamic chunked streaming", such # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. - if len(state.out_list) > 1: - logger.info(f"{len(state.out_list)=}") out = state.out_list[-1] except asyncio.TimeoutError: From 26814f1d077ab7d3f0202f5dd44cf98d2a391d32 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 17 Dec 2024 03:14:45 +0000 Subject: [PATCH 025/132] updated --- vllm/v1/engine/async_llm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 1cac46bc7d418..aec24c926ce1e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -309,7 +309,9 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): """Process outputs by putting them into per-request AsyncStreams.""" for request_output in request_outputs: - assert request_output.request_id in self.rid_to_state + if request_output.request_id not in self.rid_to_state: + raise RuntimeError(f"{request_output.request_id} " + "not in RequestStates") # Update the RequestState and alert generate() that there # is a RequestOutput ready to return to the user. 
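The cumulative-to-delta conversion that the streaming comments above describe can be shown in isolation. The sketch below is a standalone illustration, not the serving_completion.py implementation; CumulativeChunk and the sample data are invented for the example. The consumer only ever sees cumulative text and token ids and recovers per-chunk deltas by remembering how much it has already emitted, which is the role previous_text_lens and previous_num_tokens play above.

from dataclasses import dataclass
from typing import List


@dataclass
class CumulativeChunk:
    # Cumulative view of one request so far: full text and all token ids.
    text: str
    token_ids: List[int]


def to_deltas(chunks: List[CumulativeChunk]) -> List[CumulativeChunk]:
    """Turn cumulative chunks into delta chunks by tracking how much text
    and how many tokens have already been emitted."""
    deltas: List[CumulativeChunk] = []
    prev_text_len = 0
    prev_num_tokens = 0
    for chunk in chunks:
        deltas.append(
            CumulativeChunk(chunk.text[prev_text_len:],
                            chunk.token_ids[prev_num_tokens:]))
        prev_text_len = len(chunk.text)
        prev_num_tokens = len(chunk.token_ids)
    return deltas


if __name__ == "__main__":
    # A single cumulative chunk may cover several new tokens, e.g. when the
    # API server falls behind and outputs get batched up.
    cumulative = [
        CumulativeChunk("Hello", [1]),
        CumulativeChunk("Hello world", [1, 2]),
        CumulativeChunk("Hello world, how are you", [1, 2, 3, 4, 5]),
    ]
    for delta in to_deltas(cumulative):
        print(repr(delta.text), delta.token_ids)

This is why the delta path slices with output.text[previous_text_lens[i]:] rather than concatenating: each slice is exactly the text the client has not yet received, regardless of how many tokens arrived since the last chunk.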
From 1205764b975c4bf438b93ec155d6d86d84460b25 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 17 Dec 2024 16:54:57 +0000 Subject: [PATCH 026/132] Stash --- vllm/entrypoints/openai/serving_completion.py | 10 ++++++--- vllm/v1/engine/async_llm.py | 7 ++++--- vllm/v1/engine/core.py | 2 +- vllm/v1/engine/core_client.py | 9 +++++--- vllm/v1/engine/detokenizer.py | 21 ++++++++++++++++--- vllm/v1/utils.py | 16 +++++++++++++- 6 files changed, 51 insertions(+), 14 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 7a40c312f49ce..3b0bc2fa897c6 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -160,8 +160,10 @@ async def create_completion( # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - result_generator = merge_async_iterators( - *generators, is_cancelled=raw_request.is_disconnected) + # result_generator = merge_async_iterators( + # *generators, is_cancelled=raw_request.is_disconnected) + assert len(generators) == 1 + result_generator = generators[0] model_name = self._get_model_name(lora_request) num_prompts = len(engine_prompts) @@ -284,7 +286,9 @@ async def completion_stream_generator( include_usage, include_continuous_usage = False, False try: - async for prompt_idx, res in result_generator: + # async for prompt_idx, res in result_generator: + async for res in result_generator: + prompt_idx = 0 prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b3e25597f4499..5f2f596efd1d9 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -184,9 +184,6 @@ async def add_request( ) -> RequestState: """Add new request to the AsyncLLM.""" - # if self.detokenizer.is_request_active(request_id): - # raise ValueError(f"Request {request_id} already exists.") - # 1) Add to RequestState tracker. The "event" is used to manage # concurrency between generate() and output_handler(). self.rid_to_state[request_id] = RequestState.new() @@ -197,6 +194,8 @@ async def add_request( trace_headers, prompt_adapter_request, priority) # 3) Add the DetokenizerRequest to Detokenizer. + # TODO: sending these separately is a race condition. We should instead + # have the EngineCore do the "AddRequest" logic. await self.detokenizer.add_request_async(detokenizer_req) # 4) Add the EngineCoreRequest to EngineCore. @@ -268,6 +267,8 @@ async def generate( # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. out = state.out_list[-1] + if len(state.out_list) > 10: + logger.info(f"{len(state.out_list)=}") except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. 
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 30422891413c6..395efaf3bc017 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -33,7 +33,7 @@ POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -LOGGING_TIME_S = 5 +LOGGING_TIME_S = 1 class EngineCore: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index cfd3edab13877..01798dbee493e 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -130,8 +130,8 @@ class MPClient(EngineCoreClient): def __init__( self, *args, - output_path: str, asyncio_mode: bool, + output_path: Optional[str] = None, **kwargs, ): # Serialization setup. @@ -140,9 +140,9 @@ def __init__( # ZMQ setup. if asyncio_mode: - self.ctx = zmq.asyncio.Context() + self.ctx = zmq.asyncio.Context(io_threads=2) else: - self.ctx = zmq.Context() # type: ignore[attr-defined] + self.ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] input_path = get_open_zmq_ipc_path() self.input_socket = make_zmq_socket( @@ -151,6 +151,9 @@ def __init__( zmq.constants.PUSH, ) + if output_path is None: + output_path = get_open_zmq_ipc_path() + # Start EngineCore in background process. self.proc_handle: Optional[BackgroundProcHandle] self.proc_handle = EngineCoreProc.make_engine_core_process( diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index a1c384cd45967..67e3490618a32 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -13,7 +13,7 @@ from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.utils import get_open_zmq_ipc_path +from vllm.utils import get_open_zmq_ipc_path, kill_process_tree from vllm.v1.engine import (DetokenizerRequest, DetokenizerRequestType, EngineCoreOutput, EngineCoreOutputs, BackgroundProcHandle,) @@ -386,6 +386,10 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the Detokenizer.""" + log_interval = 0 + import time + + last_log = time.perf_counter() try: # TODO: handle aborted due to client cancellation # TODO: pickle -> msgpack @@ -420,7 +424,11 @@ def run_busy_loop(self): engine_core_outputs = decoder_out.decode(frame.buffer).outputs outputs = self.step(engine_core_outputs) msg = pickle.dumps(outputs, protocol=pickle.HIGHEST_PROTOCOL) - output_socket.send_multipart((msg, ), copy=False) + # now = time.perf_counter() + # if now - last_log > 0.1: + # logger.info("Detok: Sending") + # last_log = now + output_socket.send_multipart((msg, ), copy=False) except Exception as e: logger.error(e) @@ -435,7 +443,7 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): self.decoder = PickleEncoder() # ZMQ setup. - self.ctx = zmq.asyncio.Context() + self.ctx = zmq.asyncio.Context(io_threads=2) # Get input (DetokenizerRequest) to Detokenizer. 
input_path = get_open_zmq_ipc_path() @@ -463,6 +471,13 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): **kwargs, ) + def shutdown(self): + self.proc_handle.proc.terminate() + self.proc_handle.proc.join(5) + + if self.proc_handle.proc.is_alive(): + kill_process_tree(self.proc_handle.proc.pid) + async def add_request_async(self, request: DetokenizerRequest): """Send new DetokenizerRequest to Detokenizer.""" diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index ecd98f246b064..7d748f6cfee6d 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -88,11 +88,25 @@ def make_zmq_socket( ) -> Union[zmq.Socket, zmq.asyncio.Socket]: """Make a ZMQ socket with the proper bind/connext semantics.""" + import psutil + mem = psutil.virtual_memory() + socket = ctx.socket(type) + + total_mem = mem.total / 1024**3 + available_mem = mem.available / 1024**3 + if total_mem > 32 and available_mem > 16: + buf_size = int(0.5 * 1024**3) + else: + buf_size = -1 if type == zmq.constants.PULL: + socket.setsockopt(zmq.RCVHWM, 0) + socket.setsockopt(zmq.RCVBUF, buf_size) socket.connect(path) elif type == zmq.constants.PUSH: + socket.setsockopt(zmq.SNDHWM, 0) + socket.setsockopt(zmq.SNDBUF, buf_size) socket.bind(path) else: raise ValueError(f"Unknown Socket Type: {type}") @@ -105,7 +119,7 @@ def zmq_socket_ctx( type: Any) -> Iterator[zmq.Socket]: # type: ignore[name-defined] """Context manager for a ZMQ socket""" - ctx = zmq.Context() # type: ignore[attr-defined] + ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] try: yield make_zmq_socket(ctx, path, type) From 73da178500642e25a503d5c75fa7c7913407dbe5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 17 Dec 2024 22:14:46 +0000 Subject: [PATCH 027/132] stash --- benchmarks/benchmark_throughput.py | 21 ++++++++++++------- vllm/v1/engine/async_llm.py | 33 ++++++++++++++++++++++++------ vllm/v1/engine/core.py | 3 +++ vllm/v1/engine/detokenizer.py | 15 +++++++++++--- 4 files changed, 56 insertions(+), 16 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 1e5967bd9bf8b..0926cec29a907 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -170,7 +170,7 @@ def run_vllm( end = time.perf_counter() return end - start - +import asyncio async def run_vllm_async( requests: List[SampleRequest], n: int, @@ -198,17 +198,23 @@ async def run_vllm_async( max_tokens=request.expected_output_len, )) - generators = [] + tasks = [] start = time.perf_counter() for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): generator = llm.generate(prompt, sp, request_id=f"test{i}") - generators.append(generator) - all_gens = merge_async_iterators(*generators) - async for i, res in all_gens: - pass + tasks.append(run(generator)) + # all_gens = merge_async_iterators(*generators) + # async for i, res in all_gens: + # pass + + await asyncio.gather(*tasks) + end = time.perf_counter() return end - start +async def run(generator): + async for res in generator: + pass def run_hf( requests: List[SampleRequest], @@ -331,7 +337,8 @@ def main(args: argparse.Namespace): for request in requests) if args.backend == "vllm": if args.async_engine: - elapsed_time = uvloop.run( + asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + elapsed_time = asyncio.run( run_vllm_async( requests, args.n, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 5f2f596efd1d9..def445c155f83 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py 
@@ -110,7 +110,8 @@ def __init__( usage_context=usage_context, ) - self.output_handler: Optional[asyncio.Task] = None + # self.output_handler: Optional[asyncio.Task] = None + self.to_create_loop = True def __del__(self): self.shutdown() @@ -239,9 +240,15 @@ async def generate( # We start the output_handler on the first call to generate() so that # we can call __init__ before the event loop starts, which enables us # to handle startup failure gracefully in the OpenAI server. - if self.output_handler is None: - self.output_handler = asyncio.create_task( - self._run_output_handler()) + # if self.output_handler is None: + if self.to_create_loop: + loop = asyncio.get_event_loop() + print(f"{loop=}") + loop.create_task(self._run_output_handler()) + self.to_create_loop = False + # self.output_handler = asyncio.create_task( + # self._run_output_handler()) + state = await self.add_request( request_id, @@ -257,6 +264,7 @@ async def generate( try: await asyncio.wait_for(state.event.wait(), timeout=WAITING_TIMEOUT_MS) + logger.info(f"{request_id} woke up.") # NOTE(rob): out_list can have more than one item. However, # in the streaming case, we use RequestOutputKind.CUMULATIVE, @@ -272,7 +280,7 @@ async def generate( except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. - logger.debug("Timeout waiting for %s", request_id) + logger.info("Timeout waiting for %s", request_id) continue state.out_list = [] @@ -330,23 +338,36 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" + # idx = 0 + # import pyinstrument + + # prof = pyinstrument.Profiler() + # prof.start() + # i = 0 try: while True: # 1) Pull outputs from the Detokenizer. request_outputs, reqs_to_abort = ( await self.detokenizer.get_output_async()) + logger.info("AsyncLLM") + # logger.info(f"RECV: {idx}") + # idx+=1 # 2) Put the RequestOutputs into the per-request AsyncStreams. self._process_request_outputs(request_outputs) # 3) Abort any requests that finished due to stop strings. - await self.engine_core.abort_requests_async(reqs_to_abort) + # await self.engine_core.abort_requests_async(reqs_to_abort) # 4) Abort any requests due to client cancellations. # TODO: send back to detokenizer if this fails. 
# await self._process_cancellations() + # except KeyboardInterrupt: + # prof.stop() + # prof.write_html("output_handler.prof") + except Exception as e: logger.error(e) raise e diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 395efaf3bc017..e36af368568e0 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -334,10 +334,13 @@ def process_output_socket(self, output_path: str): buffer = bytearray() with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: + idx = 0 while True: engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) encoder.encode_into(outputs, buffer) # msg = (DetokenizerRequestType.OUT.value, buffer) msg = (buffer, ) + # logger.info(f"SEND: {idx}: {len(engine_core_outputs)}") + # idx += 1 socket.send_multipart(msg, copy=False) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 67e3490618a32..f989c12b89552 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -409,6 +409,7 @@ def run_busy_loop(self): poller.register(engine_core_outputs_socket, zmq.POLLIN) poller.register(input_socket, zmq.POLLIN) + # idx = 0 while True: socks = dict(poller.poll()) @@ -428,12 +429,16 @@ def run_busy_loop(self): # if now - last_log > 0.1: # logger.info("Detok: Sending") # last_log = now + # logger.info(f"SEND: {idx}: {len(engine_core_outputs)}") + # idx += 1 output_socket.send_multipart((msg, ), copy=False) except Exception as e: logger.error(e) raise e - + +import time + class DetokenizerClient: def __init__(self, *args, engine_core_outputs_path: str, **kwargs): @@ -484,9 +489,13 @@ async def add_request_async(self, request: DetokenizerRequest): msg = (self.encoder.encode(request), ) await self.input_socket.send_multipart(msg, copy=False) - async def get_output_async(self) -> Tuple[List[RequestOutput], List[str]]: """Get RequestOutputs, RequestsToAbort from Detokenizer.""" (frame, ) = await self.output_socket.recv_multipart(copy=False) - return self.decoder.decode(frame.buffer) + start = time.perf_counter() + out = self.decoder.decode(frame.buffer) + end = time.perf_counter() + if end - start > 0.1: + logger.info(f"{end - start}") + return out From 9830fbe1a7286744b040ee8d49359075c0eab317 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 18 Dec 2024 15:04:36 +0000 Subject: [PATCH 028/132] stash --- vllm/v1/engine/__init__.py | 24 +++++++++++++ vllm/v1/engine/async_llm.py | 37 +++++++------------- vllm/v1/engine/detokenizer.py | 65 ++++++++++++++++++----------------- 3 files changed, 70 insertions(+), 56 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index ee6b90b1bab1f..089ad1052e5f2 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -35,6 +35,30 @@ class DetokenizerRequest( include_stop_str_in_output: bool +class DetokenizerOutput( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True, # type: ignore[call-arg] + gc=False): # type: ignore[call-arg] + + request_id: str + text: str + finished: bool + + +class DetokenizerOutputs( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True, # type: ignore[call-arg] + gc=False): # type: ignore[call-arg] + + #NOTE(Nick): We could consider ways to make this more compact, + # e.g. 
columnwise layout and using an int enum for finish/stop reason + + # [num_reqs] + outputs: List[DetokenizerOutput] + + @dataclass class EngineCoreRequest: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index def445c155f83..8b18f34b5d85c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -243,12 +243,8 @@ async def generate( # if self.output_handler is None: if self.to_create_loop: loop = asyncio.get_event_loop() - print(f"{loop=}") loop.create_task(self._run_output_handler()) self.to_create_loop = False - # self.output_handler = asyncio.create_task( - # self._run_output_handler()) - state = await self.add_request( request_id, @@ -264,7 +260,7 @@ async def generate( try: await asyncio.wait_for(state.event.wait(), timeout=WAITING_TIMEOUT_MS) - logger.info(f"{request_id} woke up.") + # logger.info(f"{request_id} woke up.") # NOTE(rob): out_list can have more than one item. However, # in the streaming case, we use RequestOutputKind.CUMULATIVE, @@ -322,20 +318,6 @@ async def _process_cancellations(self) -> None: # Remove from EngineCore. await self.engine_core.abort_requests_async(reqs_to_abort) - def _process_request_outputs(self, request_outputs: List[RequestOutput]): - """Process outputs by putting them into per-request AsyncStreams.""" - - for request_output in request_outputs: - if request_output.request_id not in self.rid_to_state: - raise RuntimeError(f"{request_output.request_id} " - "not in RequestStates") - - # Update the RequestState and alert generate() that there - # is a RequestOutput ready to return to the user. - state = self.rid_to_state[request_output.request_id] - state.out_list.append(request_output) - state.event.set() - async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" # idx = 0 @@ -348,14 +330,21 @@ async def _run_output_handler(self): try: while True: # 1) Pull outputs from the Detokenizer. - request_outputs, reqs_to_abort = ( - await self.detokenizer.get_output_async()) - logger.info("AsyncLLM") + detokenizer_outputs = ( + await self.detokenizer.get_output_async()).outputs # logger.info(f"RECV: {idx}") # idx+=1 - # 2) Put the RequestOutputs into the per-request AsyncStreams. - self._process_request_outputs(request_outputs) + for out in detokenizer_outputs: + if out.request_id not in self.rid_to_state: + raise RuntimeError(f"{out.request_id} " + "not in RequestStates") + + # Update the RequestState and alert generate() that there + # is a RequestOutput ready to return to the user. + state = self.rid_to_state[out.request_id] + state.out_list.append(out) + state.event.set() # 3) Abort any requests that finished due to stop strings. 
# await self.engine_core.abort_requests_async(reqs_to_abort) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index f989c12b89552..76fa3a595f72b 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -14,7 +14,8 @@ AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import get_open_zmq_ipc_path, kill_process_tree -from vllm.v1.engine import (DetokenizerRequest, DetokenizerRequestType, +from vllm.v1.engine import (DetokenizerRequest, DetokenizerOutputs, + DetokenizerOutput, EngineCoreOutput, EngineCoreOutputs, BackgroundProcHandle,) from vllm.v1.utils import (make_zmq_socket, zmq_socket_ctx, @@ -251,10 +252,11 @@ def add_request( def step( self, encore_core_outputs: List[EngineCoreOutput] - ) -> Tuple[List[RequestOutput], List[str]]: + ) -> DetokenizerOutputs: """Update state and request the RequestOutputs to the LLMEngine.""" - request_outputs: List[RequestOutput] = [] + # request_outputs: List[RequestOutput] = [] + detokenizer_outputs = DetokenizerOutputs(outputs=[]) requests_to_abort: List[str] = [] for engine_core_output in encore_core_outputs: request_id = engine_core_output.request_id @@ -269,22 +271,30 @@ def step( finish_reason=engine_core_output.finish_reason, stop_reason=engine_core_output.stop_reason, ) - + if request_output is not None: - # Add to RequestOutputs list. - request_outputs.append(request_output) - - # Free completed requests. - if request_output.finished: - self.request_states.pop(request_id) - # If Request finished but EngineCore not finished, - # this was caused by a stop string + we need to send - # an abort signal to the EngineCore. - if not engine_core_output.finished: - requests_to_abort.append(request_id) + detokenizer_outputs.outputs.append( + DetokenizerOutput( + request_id=request_id, + text=request_output.outputs[0].text, + finished=request_output.finished, + ) + ) + # # Add to RequestOutputs list. + # request_outputs.append(request_output) + + # # Free completed requests. + # if request_output.finished: + # self.request_states.pop(request_id) + # # If Request finished but EngineCore not finished, + # # this was caused by a stop string + we need to send + # # an abort signal to the EngineCore. + # if not engine_core_output.finished: + # requests_to_abort.append(request_id) # Return to EngineClient. 
- return request_outputs, requests_to_abort + # return request_outputs, requests_to_abort + return detokenizer_outputs, [] class DetokenizerProc(Detokenizer): """ZMQ-wrapper for running Detokenizer in background process.""" @@ -397,6 +407,7 @@ def run_busy_loop(self): decoder_new = msgspec.msgpack.Decoder(DetokenizerRequest) decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) + encoder = msgspec.msgpack.Encoder() with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.constants.PULL) as engine_core_outputs_socket, zmq_socket_ctx(self.input_path, zmq.constants.PULL) as input_socket, @@ -423,14 +434,8 @@ def run_busy_loop(self): if engine_core_outputs_socket in socks: (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs - outputs = self.step(engine_core_outputs) - msg = pickle.dumps(outputs, protocol=pickle.HIGHEST_PROTOCOL) - # now = time.perf_counter() - # if now - last_log > 0.1: - # logger.info("Detok: Sending") - # last_log = now - # logger.info(f"SEND: {idx}: {len(engine_core_outputs)}") - # idx += 1 + detokenizer_outputs, _ = self.step(engine_core_outputs) + msg = encoder.encode(detokenizer_outputs) output_socket.send_multipart((msg, ), copy=False) except Exception as e: @@ -445,7 +450,8 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): # Serialization setup. self.encoder = msgspec.msgpack.Encoder() - self.decoder = PickleEncoder() + # self.decoder = PickleEncoder() + self.decoder = msgspec.msgpack.Decoder(DetokenizerOutputs) # ZMQ setup. self.ctx = zmq.asyncio.Context(io_threads=2) @@ -489,13 +495,8 @@ async def add_request_async(self, request: DetokenizerRequest): msg = (self.encoder.encode(request), ) await self.input_socket.send_multipart(msg, copy=False) - async def get_output_async(self) -> Tuple[List[RequestOutput], List[str]]: + async def get_output_async(self) -> DetokenizerOutputs: """Get RequestOutputs, RequestsToAbort from Detokenizer.""" (frame, ) = await self.output_socket.recv_multipart(copy=False) - start = time.perf_counter() - out = self.decoder.decode(frame.buffer) - end = time.perf_counter() - if end - start > 0.1: - logger.info(f"{end - start}") - return out + return self.decoder.decode(frame.buffer) From 661ee446710cdfcac6c713c2f08efeb500f4af23 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 18 Dec 2024 15:09:33 +0000 Subject: [PATCH 029/132] stash --- vllm/v1/engine/async_llm.py | 12 ------------ vllm/v1/engine/core.py | 6 +++--- vllm/v1/engine/core_client.py | 2 +- vllm/v1/engine/detokenizer.py | 12 ++++++------ vllm/v1/executor/multiproc_executor.py | 4 ++-- vllm/v1/utils.py | 6 +++--- 6 files changed, 15 insertions(+), 27 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 8b18f34b5d85c..403562fc4f43c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -320,20 +320,12 @@ async def _process_cancellations(self) -> None: async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" - # idx = 0 - # import pyinstrument - - # prof = pyinstrument.Profiler() - # prof.start() - # i = 0 try: while True: # 1) Pull outputs from the Detokenizer. detokenizer_outputs = ( await self.detokenizer.get_output_async()).outputs - # logger.info(f"RECV: {idx}") - # idx+=1 for out in detokenizer_outputs: if out.request_id not in self.rid_to_state: @@ -353,10 +345,6 @@ async def _run_output_handler(self): # TODO: send back to detokenizer if this fails. 
# await self._process_cancellations() - # except KeyboardInterrupt: - # prof.stop() - # prof.write_html("output_handler.prof") - except Exception as e: logger.error(e) raise e diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e36af368568e0..2eff8a8adb230 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -166,7 +166,7 @@ def __init__( daemon=True).start() # Send Readiness signal to EngineClient. - with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: + with zmq_socket_ctx(ready_path, zmq.PUSH) as ready_socket: ready_socket.send_string(EngineCoreProc.READY_STR) @staticmethod @@ -305,7 +305,7 @@ def process_input_socket(self, input_path: str): decoder_add_req = PickleEncoder() decoder_abort_req = PickleEncoder() - with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket: + with zmq_socket_ctx(input_path, zmq.PULL) as socket: while True: # (RequestType, RequestData) type_frame, data_frame = socket.recv_multipart(copy=False) @@ -333,7 +333,7 @@ def process_output_socket(self, output_path: str): # Reuse send buffer. buffer = bytearray() - with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: + with zmq_socket_ctx(output_path, zmq.PUSH) as socket: idx = 0 while True: engine_core_outputs = self.output_queue.get() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 01798dbee493e..4c0745060bc02 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -148,7 +148,7 @@ def __init__( self.input_socket = make_zmq_socket( self.ctx, input_path, - zmq.constants.PUSH, + zmq.PUSH, ) if output_path is None: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 76fa3a595f72b..43930cbfb8eab 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -317,7 +317,7 @@ def __init__( self.output_path = output_path # Send readiness signal. - with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: + with zmq_socket_ctx(ready_path, zmq.PUSH) as ready_socket: ready_socket.send_string(DetokenizerProc.READY_STR) @@ -409,9 +409,9 @@ def run_busy_loop(self): decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) encoder = msgspec.msgpack.Encoder() - with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.constants.PULL) as engine_core_outputs_socket, - zmq_socket_ctx(self.input_path, zmq.constants.PULL) as input_socket, - zmq_socket_ctx(self.output_path, zmq.constants.PUSH) as output_socket): + with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as engine_core_outputs_socket, + zmq_socket_ctx(self.input_path, zmq.PULL) as input_socket, + zmq_socket_ctx(self.output_path, zmq.PUSH) as output_socket): # TODO: avoid poll by having both EngineCore # and AsyncLLM send to the same socket (unclear why this @@ -461,7 +461,7 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): self.input_socket = make_zmq_socket( self.ctx, input_path, - zmq.constants.PUSH, + zmq.PUSH, ) # Get output (RequestOutput) from Detokenizer. @@ -469,7 +469,7 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): self.output_socket = make_zmq_socket( self.ctx, output_path, - zmq.constants.PULL, + zmq.PULL, ) # Start Detokenizer in background process. 
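The hunks above only swap the long-form zmq.constants.* names for the equivalent zmq.PULL / zmq.PUSH aliases; the socket wiring itself is unchanged, and the executor and utils hunks that follow make the same rename. For orientation, a minimal standalone sketch of the PUSH/PULL-over-IPC pattern that make_zmq_socket() sets up between the client and the background process could look like the following. The IPC path and variable names here are placeholders, not the ones vLLM generates with get_open_zmq_ipc_path():

import zmq

path = "ipc:///tmp/zmq_pushpull_demo"   # placeholder path for illustration only
ctx = zmq.Context(io_threads=2)

push = ctx.socket(zmq.PUSH)             # producer side binds, as in make_zmq_socket()
push.setsockopt(zmq.SNDHWM, 0)          # no send high-water mark
push.bind(path)

pull = ctx.socket(zmq.PULL)             # consumer side connects
pull.setsockopt(zmq.RCVHWM, 0)          # no receive high-water mark
pull.connect(path)

push.send(b"hello")
assert pull.recv() == b"hello"
ctx.destroy()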
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index aa246f778f8f3..12fe559341931 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -254,7 +254,7 @@ def __init__( worker_response_mq_handle = self.worker_response_mq.export_handle() # Send Readiness signal to EngineCore process. - with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: + with zmq_socket_ctx(ready_path, zmq.PUSH) as ready_socket: payload = pickle.dumps(worker_response_mq_handle, protocol=pickle.HIGHEST_PROTOCOL) ready_socket.send_string(WorkerProc.READY_STR) @@ -356,7 +356,7 @@ def wait_for_startup( ready_path: str, ) -> Optional[Handle]: """Wait until the Worker is ready.""" - with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: + with zmq_socket_ctx(ready_path, zmq.PULL) as socket: # Wait for Worker to send READY. while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 7d748f6cfee6d..d438985736ba5 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -100,11 +100,11 @@ def make_zmq_socket( else: buf_size = -1 - if type == zmq.constants.PULL: + if type == zmq.PULL: socket.setsockopt(zmq.RCVHWM, 0) socket.setsockopt(zmq.RCVBUF, buf_size) socket.connect(path) - elif type == zmq.constants.PUSH: + elif type == zmq.PUSH: socket.setsockopt(zmq.SNDHWM, 0) socket.setsockopt(zmq.SNDBUF, buf_size) socket.bind(path) @@ -138,7 +138,7 @@ def wait_for_startup( ) -> None: """Wait until a background process is ready.""" - with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: + with zmq_socket_ctx(ready_path, zmq.PULL) as socket: try: while socket.poll(timeout=timeout_ms) == 0: logger.debug("Waiting for background proc to startup.") From 6f1252547189b78e7c700ea66345642fcf1ac6b3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 18 Dec 2024 15:13:25 +0000 Subject: [PATCH 030/132] stash --- vllm/v1/engine/async_llm.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 403562fc4f43c..5c4c7f49de4de 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -25,9 +25,6 @@ logger = init_logger(__name__) -WAITING_TIMEOUT_MS = 5 - - @dataclass class RequestState: """ @@ -259,7 +256,7 @@ async def generate( while True: try: await asyncio.wait_for(state.event.wait(), - timeout=WAITING_TIMEOUT_MS) + timeout=4) # logger.info(f"{request_id} woke up.") # NOTE(rob): out_list can have more than one item. However, @@ -271,12 +268,12 @@ async def generate( # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. out = state.out_list[-1] - if len(state.out_list) > 10: + if len(state.out_list) > 1: logger.info(f"{len(state.out_list)=}") except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. 
- logger.info("Timeout waiting for %s", request_id) + logger.debug("Timeout waiting for %s", request_id) continue state.out_list = [] From fd91f4b085ef1fa89ed8505d824c4fc042983189 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 18 Dec 2024 16:14:22 +0000 Subject: [PATCH 031/132] stash --- vllm/v1/core/scheduler.py | 2 ++ vllm/v1/engine/__init__.py | 3 +++ vllm/v1/engine/async_llm.py | 8 +++--- vllm/v1/engine/detokenizer.py | 49 ++++++++++++++++++++++++++++++++--- 4 files changed, 55 insertions(+), 7 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index b44d72afae94a..23dc4ef298fea 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -428,6 +428,8 @@ def update_from_output( # Add EngineCoreOutput for this Request. output = EngineCoreOutput( request_id=req_id, + prompt=request.prompt, + prompt_token_ids=request.prompt_token_ids, new_token_ids=request.output_token_ids[-num_new_tokens:], finished=request.is_finished(), finish_reason=request.get_finished_reason(), diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 089ad1052e5f2..0de28b0b76f0c 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -87,10 +87,13 @@ class EngineCoreOutput( gc=False): # type: ignore[call-arg] request_id: str + prompt_token_ids: List[int] + prompt: str new_token_ids: List[int] finished: bool finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None + class EngineCoreOutputs( diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 5c4c7f49de4de..20eca6ac75a8b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -187,14 +187,14 @@ async def add_request( self.rid_to_state[request_id] = RequestState.new() # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. - detokenizer_req, engine_core_req = self.processor.process_inputs( + _, engine_core_req = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) # 3) Add the DetokenizerRequest to Detokenizer. # TODO: sending these separately is a race condition. We should instead # have the EngineCore do the "AddRequest" logic. - await self.detokenizer.add_request_async(detokenizer_req) + # await self.detokenizer.add_request_async(detokenizer_req) # 4) Add the EngineCoreRequest to EngineCore. await self.engine_core.add_request_async(engine_core_req) @@ -268,8 +268,8 @@ async def generate( # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. out = state.out_list[-1] - if len(state.out_list) > 1: - logger.info(f"{len(state.out_list)=}") + # if len(state.out_list) > 1: + # logger.info(f"{len(state.out_list)=}") except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. 
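The scheduler.py and __init__.py hunks in this commit start carrying prompt and prompt_token_ids on every EngineCoreOutput, so the detokenizer.py diff that follows can rebuild per-request state from the outputs alone. As a rough, self-contained illustration of the msgspec pattern these messages use (the class below mirrors the field layout in the diff but is not the vLLM class), array_like structs encode positionally into msgpack arrays, so the new prompt fields travel with every output sent over the inter-process socket:

from typing import List, Optional

import msgspec


class Output(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
    # Field order matters: array_like structs are encoded positionally.
    request_id: str
    prompt_token_ids: List[int]
    prompt: str
    new_token_ids: List[int]
    finished: bool
    finish_reason: Optional[str] = None


encoder = msgspec.msgpack.Encoder()
decoder = msgspec.msgpack.Decoder(Output)

wire = encoder.encode(Output("req-0", [1, 2, 3], "hello", [4], False))
assert decoder.decode(wire).prompt_token_ids == [1, 2, 3]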
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 43930cbfb8eab..e9a5a00d696b1 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -106,6 +106,37 @@ def from_new_request( stop_buffer_length=stop_buffer_length, ) + @classmethod + def from_eco( + cls, + tokenizer: AnyTokenizer, + eco: EngineCoreOutput, + ): + tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( + tokenizer=tokenizer, + prompt_ids=eco.prompt_token_ids, + skip_special_tokens=True, + ) + + return cls( + output_text="", + tokens=tokens, + token_ids=eco.prompt_token_ids, + stop=[], + include_stop_str_in_output=False, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=True, + spaces_between_special_tokens=True, + output_kind=RequestOutputKind.CUMULATIVE, + request_id=eco.request_id, + prompt=eco.prompt, + prompt_token_ids=eco.prompt_token_ids, + tokenizer=tokenizer, + stop_buffer_length=0, + ) + + def add_tokens( self, new_token_ids: List[int], @@ -158,8 +189,6 @@ def add_tokens( finish_reason = "stop" # TODO: use constant stop_reason = stop_str - # TODO: handle stop_token_ids here too? - # 3) Update the RequestOutput object with the new text. finished = bool(finish_reason) if self.output_kind == RequestOutputKind.FINAL_ONLY \ @@ -250,16 +279,30 @@ def add_request( self.tokenizer, request) self.request_states[request.request_id] = request_state + def add_request_eco( + self, + eco: EngineCoreOutput, + ): + request_state = IncrementalDetokenizer.from_eco( + self.tokenizer, eco) + self.request_states[eco.request_id] = request_state + + def step( self, encore_core_outputs: List[EngineCoreOutput] ) -> DetokenizerOutputs: """Update state and request the RequestOutputs to the LLMEngine.""" # request_outputs: List[RequestOutput] = [] + # requests_to_abort: List[str] = [] detokenizer_outputs = DetokenizerOutputs(outputs=[]) - requests_to_abort: List[str] = [] + for engine_core_output in encore_core_outputs: request_id = engine_core_output.request_id + + if request_id not in self.request_states: + self.add_request_eco(engine_core_output) + detokenizer = self.request_states.get(request_id) if detokenizer is None: # Ignore output for already-aborted request. 
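Taken together, from_eco() and add_request_eco() above point toward dropping the separate add-request hop into the Detokenizer: once every output carries the prompt, the background process can create state the first time it sees a request_id, which is why the async_llm.py hunk earlier in this commit comments out the detokenizer.add_request_async() call. A simplified sketch of that lazy-registration idea, with illustrative names and shapes rather than the real vLLM interfaces:

from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class ReqState:
    prompt_token_ids: List[int]
    output_token_ids: List[int] = field(default_factory=list)


class LazyDetokenizer:
    def __init__(self) -> None:
        self.request_states: Dict[str, ReqState] = {}

    def step(self, outputs: List[dict]) -> None:
        for out in outputs:
            # The first output for a request carries its prompt, so state is
            # created here instead of through a separate add_request() call
            # that could race with the first output.
            state = self.request_states.get(out["request_id"])
            if state is None:
                state = ReqState(list(out["prompt_token_ids"]))
                self.request_states[out["request_id"]] = state
            state.output_token_ids.extend(out["new_token_ids"])
            if out["finished"]:
                self.request_states.pop(out["request_id"], None)

The trade-off, visible in the __init__.py hunk above, is that prompt data now rides on the output messages rather than being sent once at request registration.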
From 6c99a4f33e2adc9d3f07fafaff2de7c8a10d93ce Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 18 Dec 2024 17:10:04 +0000 Subject: [PATCH 032/132] stash --- vllm/outputs.py | 25 +++++++++ vllm/v1/engine/__init__.py | 1 + vllm/v1/engine/async_llm.py | 95 +++++++++++++++++++++-------------- vllm/v1/engine/core_client.py | 2 +- vllm/v1/engine/detokenizer.py | 1 + 5 files changed, 84 insertions(+), 40 deletions(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index 2ecdf74ee59b3..9a4b4353deb1d 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -12,6 +12,7 @@ from vllm.sampling_params import RequestOutputKind from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs, SequenceGroup, SequenceGroupBase, SequenceStatus) +from vllm.v1.engine import DetokenizerOutput @dataclass @@ -132,6 +133,30 @@ def __init__( self.encoder_prompt_token_ids = encoder_prompt_token_ids self.num_cached_tokens = num_cached_tokens + @classmethod + def from_detok( + cls, + prompt: str, + prompt_token_ids: List[int], + detok_output: DetokenizerOutput, + ): + completion_output = CompletionOutput( + index=0, + text=detok_output.text, + token_ids=detok_output.token_ids, + cumulative_logprob=None, + logprobs=None, # TODO + ) + + return RequestOutput( + request_id=detok_output.request_id, + prompt=prompt, + prompt_token_ids=prompt_token_ids, + prompt_logprobs=None, # TODO + outputs=[completion_output], + finished=detok_output.finished, + ) + @classmethod def new( cls, diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 0de28b0b76f0c..860662602160a 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -43,6 +43,7 @@ class DetokenizerOutput( request_id: str text: str + token_ids: List[int] finished: bool diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 20eca6ac75a8b..8148d28de1bf1 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -25,6 +25,9 @@ logger = init_logger(__name__) +import uvloop +asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + @dataclass class RequestState: """ @@ -39,12 +42,14 @@ class RequestState: out_list back to the caller generate() """ + prompt: str + prompt_token_ids: List[int] event: asyncio.Event out_list: List[RequestOutput] @classmethod - def new(cls) -> "RequestState": - return cls(asyncio.Event(), []) + def new(cls, prompt, prompt_token_ids) -> "RequestState": + return cls(prompt, prompt_token_ids, asyncio.Event(), []) class AsyncLLM(EngineClient): @@ -63,6 +68,7 @@ def __init__( ) -> None: assert start_engine_loop + self.warned = False self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers = stat_loggers @@ -182,14 +188,18 @@ async def add_request( ) -> RequestState: """Add new request to the AsyncLLM.""" - # 1) Add to RequestState tracker. The "event" is used to manage - # concurrency between generate() and output_handler(). - self.rid_to_state[request_id] = RequestState.new() + + # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. _, engine_core_req = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) + + # 1) Add to RequestState tracker. The "event" is used to manage + # concurrency between generate() and output_handler(). + self.rid_to_state[request_id] = RequestState.new(prompt, + engine_core_req.prompt_token_ids) # 3) Add the DetokenizerRequest to Detokenizer. # TODO: sending these separately is a race condition. 
We should instead @@ -231,8 +241,8 @@ async def generate( """ # DELTA streaming is not supported due to dynamic chunking. - assert (sampling_params.output_kind == RequestOutputKind.CUMULATIVE - or sampling_params.output_kind == RequestOutputKind.FINAL_ONLY) + assert (sampling_params.output_kind == RequestOutputKind.CUMULATIVE or + sampling_params.output_kind == RequestOutputKind.FINAL_ONLY) # We start the output_handler on the first call to generate() so that # we can call __init__ before the event loop starts, which enables us @@ -255,9 +265,7 @@ async def generate( while True: try: - await asyncio.wait_for(state.event.wait(), - timeout=4) - # logger.info(f"{request_id} woke up.") + await asyncio.wait_for(state.event.wait(), timeout=4) # NOTE(rob): out_list can have more than one item. However, # in the streaming case, we use RequestOutputKind.CUMULATIVE, @@ -268,8 +276,9 @@ async def generate( # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. out = state.out_list[-1] - # if len(state.out_list) > 1: - # logger.info(f"{len(state.out_list)=}") + if len(state.out_list) > 2 and not self.warned: + logger.info(f"{len(state.out_list)=}") + self.warned = True except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. @@ -285,35 +294,35 @@ async def generate( state.event.clear() yield out - async def _process_cancellations(self) -> None: - """ - Process requests cancelled from user disconnecting. + # async def _process_cancellations(self) -> None: + # """ + # Process requests cancelled from user disconnecting. - When a client disconnects, AsyncStream._cancel() is called. - We passed a callback to AsyncStream(), which appends to - self.client_aborted_requests. + # When a client disconnects, AsyncStream._cancel() is called. + # We passed a callback to AsyncStream(), which appends to + # self.client_aborted_requests. - As a result, if any requests are canceled from the user side - the request_id will show up in self.client_aborted_requests. - """ + # As a result, if any requests are canceled from the user side + # the request_id will show up in self.client_aborted_requests. + # """ - # Avoid streams having circular ref to parent AsyncLLM object. - if not self.client_aborted_requests: - return - reqs_to_abort = self.client_aborted_requests.copy() - self.client_aborted_requests.clear() + # # Avoid streams having circular ref to parent AsyncLLM object. + # if not self.client_aborted_requests: + # return + # reqs_to_abort = self.client_aborted_requests.copy() + # self.client_aborted_requests.clear() - # Remove from Detokenizer. - self.detokenizer.abort_requests(reqs_to_abort) + # # Remove from Detokenizer. + # self.detokenizer.abort_requests(reqs_to_abort) - # Remove from RequestStreams. - for request_id in reqs_to_abort: - if self.log_requests: - logger.info("User-cancelled request %s.", request_id) - self._finish_stream(request_id) + # # Remove from RequestStreams. + # for request_id in reqs_to_abort: + # if self.log_requests: + # logger.info("User-cancelled request %s.", request_id) + # self._finish_stream(request_id) - # Remove from EngineCore. - await self.engine_core.abort_requests_async(reqs_to_abort) + # # Remove from EngineCore. 
+ # await self.engine_core.abort_requests_async(reqs_to_abort) async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" @@ -324,14 +333,22 @@ async def _run_output_handler(self): detokenizer_outputs = ( await self.detokenizer.get_output_async()).outputs - for out in detokenizer_outputs: - if out.request_id not in self.rid_to_state: - raise RuntimeError(f"{out.request_id} " + for detok_out in detokenizer_outputs: + if detok_out.request_id not in self.rid_to_state: + raise RuntimeError(f"{detok_out.request_id} " "not in RequestStates") + state = self.rid_to_state[detok_out.request_id] + + out = RequestOutput.from_detok( + state.prompt, + state.prompt_token_ids, + detok_out, + ) + # Update the RequestState and alert generate() that there # is a RequestOutput ready to return to the user. - state = self.rid_to_state[out.request_id] + state.out_list.append(out) state.event.set() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4c0745060bc02..4327e48b86ef1 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -232,7 +232,7 @@ async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: msg = (request_type.value, self.encoder.encode(request)) - await self.input_socket.send_multipart(msg, copy=False) + await self.input_socket.send_multipart(msg, copy=False, flag=zmq.NOBLOCK) async def add_request_async(self, request: EngineCoreRequest) -> None: await self._send_input(EngineCoreRequestType.ADD, request) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index e9a5a00d696b1..d56f9646fdda3 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -319,6 +319,7 @@ def step( detokenizer_outputs.outputs.append( DetokenizerOutput( request_id=request_id, + token_ids=request_output.outputs[0].token_ids, text=request_output.outputs[0].text, finished=request_output.finished, ) From dfa452658111a0341d8b224923b1cc8b6d811016 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 19 Dec 2024 00:09:53 +0000 Subject: [PATCH 033/132] stahs --- vllm/v1/engine/async_llm.py | 16 +++++++++------- vllm/v1/engine/core.py | 5 +++++ vllm/v1/engine/core_client.py | 2 +- vllm/v1/engine/detokenizer.py | 20 +++++++++++++------- 4 files changed, 28 insertions(+), 15 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 8148d28de1bf1..a0ed047f630f1 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -188,9 +188,6 @@ async def add_request( ) -> RequestState: """Add new request to the AsyncLLM.""" - - - # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. _, engine_core_req = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, @@ -275,15 +272,16 @@ async def generate( # a delta text. This way we do "dynamic chunked streaming", such # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. - out = state.out_list[-1] - if len(state.out_list) > 2 and not self.warned: - logger.info(f"{len(state.out_list)=}") - self.warned = True except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. 
logger.debug("Timeout waiting for %s", request_id) continue + + out = state.out_list[-1] + if len(state.out_list) > 2: + logger.info(f"{len(state.out_list)=}") + self.warned = True state.out_list = [] if out.finished: @@ -326,9 +324,13 @@ async def generate( async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" + epoch = 0 try: while True: + logger.info(f"EPOCH: {epoch}") + epoch += 1 + # 1) Pull outputs from the Detokenizer. detokenizer_outputs = ( await self.detokenizer.get_output_async()).outputs diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 2eff8a8adb230..8616141c4cec7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -243,9 +243,14 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the EngineCore.""" + + epoch = 0 # Loop until process is sent a SIGINT or SIGTERM while True: + logger.info(f"EPOCH: {epoch}") + epoch += 1 + # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): while True: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4327e48b86ef1..4c0745060bc02 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -232,7 +232,7 @@ async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: msg = (request_type.value, self.encoder.encode(request)) - await self.input_socket.send_multipart(msg, copy=False, flag=zmq.NOBLOCK) + await self.input_socket.send_multipart(msg, copy=False) async def add_request_async(self, request: EngineCoreRequest) -> None: await self._send_input(EngineCoreRequestType.ADD, request) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index d56f9646fdda3..141be5e81f589 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -231,7 +231,7 @@ def _get_next_output_text(self, finished: bool, delta: bool) -> str: return self.output_text[last_offset:length] return "" - +import time class Detokenizer: def __init__(self, @@ -456,6 +456,8 @@ def run_busy_loop(self): with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as engine_core_outputs_socket, zmq_socket_ctx(self.input_path, zmq.PULL) as input_socket, zmq_socket_ctx(self.output_path, zmq.PUSH) as output_socket): + + epoch = 0 # TODO: avoid poll by having both EngineCore # and AsyncLLM send to the same socket (unclear why this @@ -466,6 +468,9 @@ def run_busy_loop(self): # idx = 0 while True: + logger.info(f"EPOCH: {epoch}") + epoch += 1 + socks = dict(poller.poll()) # Handle NewRequest. @@ -480,14 +485,13 @@ def run_busy_loop(self): engine_core_outputs = decoder_out.decode(frame.buffer).outputs detokenizer_outputs, _ = self.step(engine_core_outputs) msg = encoder.encode(detokenizer_outputs) - output_socket.send_multipart((msg, ), copy=False) + # output_socket.send_multipart((msg, ), copy=False) + output_socket.send(msg) except Exception as e: logger.error(e) raise e -import time - class DetokenizerClient: def __init__(self, *args, engine_core_outputs_path: str, **kwargs): @@ -498,7 +502,7 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): self.decoder = msgspec.msgpack.Decoder(DetokenizerOutputs) # ZMQ setup. - self.ctx = zmq.asyncio.Context(io_threads=2) + self.ctx = zmq.asyncio.Context(4) # Get input (DetokenizerRequest) to Detokenizer. 
input_path = get_open_zmq_ipc_path() @@ -542,5 +546,7 @@ async def add_request_async(self, request: DetokenizerRequest): async def get_output_async(self) -> DetokenizerOutputs: """Get RequestOutputs, RequestsToAbort from Detokenizer.""" - (frame, ) = await self.output_socket.recv_multipart(copy=False) - return self.decoder.decode(frame.buffer) + # (frame, ) = await self.output_socket.recv_multipart(copy=False) + # return self.decoder.decode(frame.buffer) + msg = await self.output_socket.recv() + return self.decoder.decode(msg) From e3d6b0e3cfc6ed3572c8a99ca8e5ed0dbdffe253 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 19 Dec 2024 04:22:58 +0000 Subject: [PATCH 034/132] stash --- vllm/v1/engine/__init__.py | 2 +- vllm/v1/engine/async_llm.py | 4 ++-- vllm/v1/engine/core.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 860662602160a..f81869a46b837 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -89,7 +89,7 @@ class EngineCoreOutput( request_id: str prompt_token_ids: List[int] - prompt: str + prompt: Optional[str] new_token_ids: List[int] finished: bool finish_reason: Optional[str] = None diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a0ed047f630f1..e95ee059853b6 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -328,8 +328,8 @@ async def _run_output_handler(self): try: while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 + # logger.info(f"EPOCH: {epoch}") + # epoch += 1 # 1) Pull outputs from the Detokenizer. detokenizer_outputs = ( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 8616141c4cec7..2957039c8fa19 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -248,8 +248,8 @@ def run_busy_loop(self): # Loop until process is sent a SIGINT or SIGTERM while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 + # logger.info(f"EPOCH: {epoch}") + # epoch += 1 # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): From 2c0a7939484892b6ab76ddf58c71291db75ab9d4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 20 Dec 2024 03:28:21 +0000 Subject: [PATCH 035/132] yay --- benchmarks/benchmark_throughput.py | 3 + vllm/v1/engine/async_llm.py | 93 ++++++++++++++++-------------- vllm/v1/engine/core_client.py | 1 + vllm/v1/engine/detokenizer.py | 17 +++--- 4 files changed, 63 insertions(+), 51 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 0926cec29a907..12ee9798d827e 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -207,6 +207,9 @@ async def run_vllm_async( # async for i, res in all_gens: # pass + from aiodebug import log_slow_callbacks + loop = asyncio.get_event_loop() + log_slow_callbacks.enable(0.05) await asyncio.gather(*tasks) end = time.perf_counter() diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index e95ee059853b6..88de4d114446a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -115,6 +115,7 @@ def __init__( # self.output_handler: Optional[asyncio.Task] = None self.to_create_loop = True + self.epoch = 0 def __del__(self): self.shutdown() @@ -246,9 +247,17 @@ async def generate( # to handle startup failure gracefully in the OpenAI server. 
# if self.output_handler is None: if self.to_create_loop: + + import signal + def signal_handler(self, signum=None, frame=None): + logger.warning( + f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." + ) + + self.to_create_loop = False loop = asyncio.get_event_loop() loop.create_task(self._run_output_handler()) - self.to_create_loop = False + loop.add_signal_handler(signal.SIGTERM, signal_handler) state = await self.add_request( request_id, @@ -275,13 +284,13 @@ async def generate( except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. - logger.debug("Timeout waiting for %s", request_id) + # logger.debug("Timeout waiting for %s", request_id) continue out = state.out_list[-1] if len(state.out_list) > 2: logger.info(f"{len(state.out_list)=}") - self.warned = True + state.out_list = [] if out.finished: @@ -324,46 +333,44 @@ async def generate( async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" - epoch = 0 - - try: - while True: - # logger.info(f"EPOCH: {epoch}") - # epoch += 1 - - # 1) Pull outputs from the Detokenizer. - detokenizer_outputs = ( - await self.detokenizer.get_output_async()).outputs - - for detok_out in detokenizer_outputs: - if detok_out.request_id not in self.rid_to_state: - raise RuntimeError(f"{detok_out.request_id} " - "not in RequestStates") - - state = self.rid_to_state[detok_out.request_id] - - out = RequestOutput.from_detok( - state.prompt, - state.prompt_token_ids, - detok_out, - ) - - # Update the RequestState and alert generate() that there - # is a RequestOutput ready to return to the user. - - state.out_list.append(out) - state.event.set() - - # 3) Abort any requests that finished due to stop strings. - # await self.engine_core.abort_requests_async(reqs_to_abort) - - # 4) Abort any requests due to client cancellations. - # TODO: send back to detokenizer if this fails. - # await self._process_cancellations() - - except Exception as e: - logger.error(e) - raise e + # epoch = 0 + + while True: + # logger.info(f"EPOCH: {epoch}") + # self.warned = False + # if self.epoch % 10 == 0: + # logger.info(f"\n{self.epoch=}\n") + # self.epoch += 1 + + # 1) Pull outputs from the Detokenizer. + detokenizer_outputs = ( + await self.detokenizer.get_output_async()).outputs + + for detok_out in detokenizer_outputs: + if detok_out.request_id not in self.rid_to_state: + raise RuntimeError(f"{detok_out.request_id} " + "not in RequestStates") + + state = self.rid_to_state[detok_out.request_id] + + out = RequestOutput.from_detok( + state.prompt, + state.prompt_token_ids, + detok_out, + ) + + # Update the RequestState and alert generate() that there + # is a RequestOutput ready to return to the user. + + state.out_list.append(out) + state.event.set() + + # 3) Abort any requests that finished due to stop strings. + # await self.engine_core.abort_requests_async(reqs_to_abort) + + # 4) Abort any requests due to client cancellations. + # TODO: send back to detokenizer if this fails. + # await self._process_cancellations() async def abort(self, request_id: str) -> None: # Note: this is not used outside of testing. diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4c0745060bc02..7559ca1af2a03 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -140,6 +140,7 @@ def __init__( # ZMQ setup. 
if asyncio_mode: + print("HERE HERE HERE") self.ctx = zmq.asyncio.Context(io_threads=2) else: self.ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 141be5e81f589..a355be676d5e5 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -468,8 +468,8 @@ def run_busy_loop(self): # idx = 0 while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 + # logger.info(f"EPOCH: {epoch}") + # epoch += 1 socks = dict(poller.poll()) @@ -484,9 +484,10 @@ def run_busy_loop(self): (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs detokenizer_outputs, _ = self.step(engine_core_outputs) - msg = encoder.encode(detokenizer_outputs) - # output_socket.send_multipart((msg, ), copy=False) - output_socket.send(msg) + # msg = encoder.encode(detokenizer_outputs) + # # output_socket.send_multipart((msg, ), copy=False) + # output_socket.send(msg) + output_socket.send_pyobj(detokenizer_outputs) except Exception as e: logger.error(e) @@ -502,7 +503,7 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): self.decoder = msgspec.msgpack.Decoder(DetokenizerOutputs) # ZMQ setup. - self.ctx = zmq.asyncio.Context(4) + self.ctx = zmq.asyncio.Context(2) # Get input (DetokenizerRequest) to Detokenizer. input_path = get_open_zmq_ipc_path() @@ -548,5 +549,5 @@ async def get_output_async(self) -> DetokenizerOutputs: # (frame, ) = await self.output_socket.recv_multipart(copy=False) # return self.decoder.decode(frame.buffer) - msg = await self.output_socket.recv() - return self.decoder.decode(msg) + return await self.output_socket.recv_pyobj() + # return self.decoder.decode(msg) From ee791b21dddeed6deb39b68e3a65f2c6bcf54217 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 20 Dec 2024 03:44:26 +0000 Subject: [PATCH 036/132] no more preemptions --- vllm/v1/core/scheduler.py | 1 + vllm/v1/engine/async_llm.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 23dc4ef298fea..c83c931f75fea 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -140,6 +140,7 @@ def schedule(self) -> "SchedulerOutput": preempted_req.status = RequestStatus.PREEMPTED preempted_req.num_computed_tokens = 0 + logger.info(f"Preempted: {preempted_req.request_id}") self.waiting.appendleft(preempted_req) preempted_reqs.append(preempted_req) if preempted_req == request: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 88de4d114446a..ba48c7d6f1761 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -288,9 +288,8 @@ def signal_handler(self, signum=None, frame=None): continue out = state.out_list[-1] - if len(state.out_list) > 2: + if len(state.out_list) > 1: logger.info(f"{len(state.out_list)=}") - state.out_list = [] if out.finished: From 37135028988d085dc3af964f6c097707f51670fe Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 17:13:33 +0000 Subject: [PATCH 037/132] stash current state of async llm --- vllm/entrypoints/openai/api_server.py | 6 +++- vllm/entrypoints/openai/serving_completion.py | 14 ++++++++ vllm/v1/engine/async_llm.py | 30 +++++++---------- vllm/v1/engine/core.py | 7 ++-- vllm/v1/engine/detokenizer.py | 32 +++++++------------ 5 files changed, 44 insertions(+), 45 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py 
b/vllm/entrypoints/openai/api_server.py index 14e3a34ce141c..f301ada394000 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -374,12 +374,15 @@ async def create_chat_completion(request: ChatCompletionRequest, @router.post("/v1/completions") async def create_completion(request: CompletionRequest, raw_request: Request): + raw_request.app.count += 1 + should_profile = raw_request.app.count == 500 handler = completion(raw_request) if handler is None: return base(raw_request).create_error_response( message="The model does not support Completions API") - generator = await handler.create_completion(request, raw_request) + generator = await handler.create_completion(request, raw_request, + should_profile=should_profile) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -494,6 +497,7 @@ def build_app(args: Namespace) -> FastAPI: app = FastAPI(lifespan=lifespan) app.include_router(router) app.root_path = args.root_path + app.count = 0 mount_metrics(app) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 3b0bc2fa897c6..46c820fb5a794 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -61,6 +61,7 @@ async def create_completion( self, request: CompletionRequest, raw_request: Request, + should_profile: bool=False ) -> Union[AsyncGenerator[str, None], CompletionResponse, ErrorResponse]: """Completion API similar to OpenAI's API. @@ -188,6 +189,7 @@ async def create_completion( tokenizer=tokenizer, request_metadata=request_metadata, output_kind=sampling_params.output_kind, + should_profile=should_profile, ) # Non-streaming response @@ -247,6 +249,7 @@ async def completion_stream_generator( tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, output_kind: RequestOutputKind, + should_profile: bool = False, ) -> AsyncGenerator[str, None]: """ In V0, we use RequestOutputType.DELTA and each RequestOutput @@ -268,6 +271,12 @@ async def completion_stream_generator( such that the API server falls behind, we dynamically fall back to streaming chunks of tokens. """ + if should_profile: + from pyinstrument import Profiler + print("STARTING PROFILER") + profiler = Profiler(async_mode="disabled") + profiler.start() + assert (output_kind == RequestOutputKind.CUMULATIVE or output_kind == RequestOutputKind.DELTA) @@ -293,6 +302,10 @@ async def completion_stream_generator( prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt + if res.finished and should_profile: + profiler.stop() + profiler.write_html("task-disabled.html") + # Prompt details are excluded from later streamed outputs if res.prompt_token_ids is not None: num_prompt_tokens[prompt_idx] = len(res.prompt_token_ids) @@ -419,6 +432,7 @@ async def completion_stream_generator( yield f"data: {data}\n\n" yield "data: [DONE]\n\n" + def request_output_to_completion_response( self, final_res_batch: List[RequestOutput], diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index ba48c7d6f1761..09ee89e645a44 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -281,16 +281,15 @@ def signal_handler(self, signum=None, frame=None): # a delta text. This way we do "dynamic chunked streaming", such # that the API client does not fall behind the EngineCor, # which happens at high QPS otherwise. 
+ out = state.out_list[-1] + if len(state.out_list) > 1: + logger.info(f"{len(state.out_list)=}") except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. # logger.debug("Timeout waiting for %s", request_id) continue - out = state.out_list[-1] - if len(state.out_list) > 1: - logger.info(f"{len(state.out_list)=}") - state.out_list = [] if out.finished: del self.rid_to_state[request_id] @@ -332,31 +331,24 @@ def signal_handler(self, signum=None, frame=None): async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" - # epoch = 0 + epoch = 0 while True: - # logger.info(f"EPOCH: {epoch}") + logger.info(f"EPOCH: {epoch}") + epoch += 1 # self.warned = False # if self.epoch % 10 == 0: # logger.info(f"\n{self.epoch=}\n") - # self.epoch += 1 # 1) Pull outputs from the Detokenizer. - detokenizer_outputs = ( - await self.detokenizer.get_output_async()).outputs + outputs = await self.detokenizer.output_socket.recv_pyobj() - for detok_out in detokenizer_outputs: - if detok_out.request_id not in self.rid_to_state: - raise RuntimeError(f"{detok_out.request_id} " + for out in outputs: + if out.request_id not in self.rid_to_state: + raise RuntimeError(f"{out.request_id} " "not in RequestStates") - state = self.rid_to_state[detok_out.request_id] - - out = RequestOutput.from_detok( - state.prompt, - state.prompt_token_ids, - detok_out, - ) + state = self.rid_to_state[out.request_id] # Update the RequestState and alert generate() that there # is a RequestOutput ready to return to the user. diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 2957039c8fa19..f8aef10908514 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -244,12 +244,11 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the EngineCore.""" - epoch = 0 - # Loop until process is sent a SIGINT or SIGTERM + epoch = 0 while True: - # logger.info(f"EPOCH: {epoch}") - # epoch += 1 + logger.info(f"EPOCH: {epoch}") + epoch += 1 # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index a355be676d5e5..8db857bca9b72 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -293,9 +293,9 @@ def step( ) -> DetokenizerOutputs: """Update state and request the RequestOutputs to the LLMEngine.""" - # request_outputs: List[RequestOutput] = [] + request_outputs: List[RequestOutput] = [] # requests_to_abort: List[str] = [] - detokenizer_outputs = DetokenizerOutputs(outputs=[]) + # detokenizer_outputs = DetokenizerOutputs(outputs=[]) for engine_core_output in encore_core_outputs: request_id = engine_core_output.request_id @@ -316,16 +316,8 @@ def step( ) if request_output is not None: - detokenizer_outputs.outputs.append( - DetokenizerOutput( - request_id=request_id, - token_ids=request_output.outputs[0].token_ids, - text=request_output.outputs[0].text, - finished=request_output.finished, - ) - ) - # # Add to RequestOutputs list. - # request_outputs.append(request_output) + # Add to RequestOutputs list. + request_outputs.append(request_output) # # Free completed requests. # if request_output.finished: @@ -338,7 +330,7 @@ def step( # Return to EngineClient. 
# return request_outputs, requests_to_abort - return detokenizer_outputs, [] + return request_outputs, [] class DetokenizerProc(Detokenizer): """ZMQ-wrapper for running Detokenizer in background process.""" @@ -456,8 +448,6 @@ def run_busy_loop(self): with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as engine_core_outputs_socket, zmq_socket_ctx(self.input_path, zmq.PULL) as input_socket, zmq_socket_ctx(self.output_path, zmq.PUSH) as output_socket): - - epoch = 0 # TODO: avoid poll by having both EngineCore # and AsyncLLM send to the same socket (unclear why this @@ -466,10 +456,10 @@ def run_busy_loop(self): poller.register(engine_core_outputs_socket, zmq.POLLIN) poller.register(input_socket, zmq.POLLIN) - # idx = 0 + epoch = 0 while True: - # logger.info(f"EPOCH: {epoch}") - # epoch += 1 + logger.info(f"EPOCH: {epoch}") + epoch += 1 socks = dict(poller.poll()) @@ -483,11 +473,11 @@ def run_busy_loop(self): if engine_core_outputs_socket in socks: (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs - detokenizer_outputs, _ = self.step(engine_core_outputs) + request_outputs, _ = self.step(engine_core_outputs) # msg = encoder.encode(detokenizer_outputs) # # output_socket.send_multipart((msg, ), copy=False) # output_socket.send(msg) - output_socket.send_pyobj(detokenizer_outputs) + output_socket.send_pyobj(request_outputs) except Exception as e: logger.error(e) @@ -544,7 +534,7 @@ async def add_request_async(self, request: DetokenizerRequest): msg = (self.encoder.encode(request), ) await self.input_socket.send_multipart(msg, copy=False) - async def get_output_async(self) -> DetokenizerOutputs: + async def get_output_async(self) -> List[RequestOutput]: """Get RequestOutputs, RequestsToAbort from Detokenizer.""" # (frame, ) = await self.output_socket.recv_multipart(copy=False) From bcd45be052c88e965486f8eeed27537bbc098234 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 18:30:34 +0000 Subject: [PATCH 038/132] stash profile' --- vllm/entrypoints/openai/api_server.py | 445 +++++++++--------- vllm/entrypoints/openai/serving_completion.py | 7 +- 2 files changed, 228 insertions(+), 224 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f301ada394000..fe312c3f96cdb 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -245,7 +245,7 @@ def _cleanup_ipc_path(): multiprocess.mark_process_dead(engine_process.pid) -router = APIRouter() +# router = APIRouter() def mount_metrics(app: FastAPI): @@ -303,253 +303,254 @@ def engine_client(request: Request) -> EngineClient: return request.app.state.engine_client -@router.get("/health") -async def health(raw_request: Request) -> Response: - """Health check.""" - await engine_client(raw_request).check_health() - return Response(status_code=200) - - -@router.post("/tokenize") -async def tokenize(request: TokenizeRequest, raw_request: Request): - handler = tokenization(raw_request) - - generator = await handler.create_tokenize(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, TokenizeResponse): - return JSONResponse(content=generator.model_dump()) - - assert_never(generator) - - -@router.post("/detokenize") -async def detokenize(request: DetokenizeRequest, raw_request: Request): - handler = tokenization(raw_request) - - 
generator = await handler.create_detokenize(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, DetokenizeResponse): - return JSONResponse(content=generator.model_dump()) - - assert_never(generator) - - -@router.get("/v1/models") -async def show_available_models(raw_request: Request): - handler = base(raw_request) - - models = await handler.show_available_models() - return JSONResponse(content=models.model_dump()) - - -@router.get("/version") -async def show_version(): - ver = {"version": VLLM_VERSION} - return JSONResponse(content=ver) - - -@router.post("/v1/chat/completions") -async def create_chat_completion(request: ChatCompletionRequest, - raw_request: Request): - handler = chat(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Chat Completions API") - - generator = await handler.create_chat_completion(request, raw_request) - - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - - elif isinstance(generator, ChatCompletionResponse): - return JSONResponse(content=generator.model_dump()) - - return StreamingResponse(content=generator, media_type="text/event-stream") - - -@router.post("/v1/completions") -async def create_completion(request: CompletionRequest, raw_request: Request): - raw_request.app.count += 1 - should_profile = raw_request.app.count == 500 - handler = completion(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Completions API") - - generator = await handler.create_completion(request, raw_request, - should_profile=should_profile) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, CompletionResponse): - return JSONResponse(content=generator.model_dump()) - - return StreamingResponse(content=generator, media_type="text/event-stream") +def build_app(args: Namespace) -> FastAPI: + if args.disable_fastapi_docs: + app = FastAPI(openapi_url=None, + docs_url=None, + redoc_url=None, + lifespan=lifespan) + else: + app = FastAPI(lifespan=lifespan) + # app.include_router(router) + app.root_path = args.root_path + app.count = 0 + mount_metrics(app) -@router.post("/v1/embeddings") -async def create_embedding(request: EmbeddingRequest, raw_request: Request): - handler = embedding(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Embeddings API") + app.add_middleware( + CORSMiddleware, + allow_origins=args.allowed_origins, + allow_credentials=args.allow_credentials, + allow_methods=args.allowed_methods, + allow_headers=args.allowed_headers, + ) - generator = await handler.create_embedding(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, EmbeddingResponse): - return JSONResponse(content=generator.model_dump()) + @app.get("/health") + async def health(raw_request: Request) -> Response: + """Health check.""" + await engine_client(raw_request).check_health() + return Response(status_code=200) - assert_never(generator) + @app.post("/tokenize") + async def tokenize(request: TokenizeRequest, raw_request: Request): + handler = tokenization(raw_request) 
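
# For context on the hunk above: endpoints move from a module-level APIRouter
# onto closures defined inside the app factory, so each handler can close over
# per-app state (here the request counter that drives the profiling hook).
# A minimal sketch of the two registration styles; `make_app` and the `/count`
# route are illustrative names, not part of vLLM.
from fastapi import APIRouter, FastAPI

# Style 1: module-level router, attached to the app later.
router = APIRouter()


@router.get("/ping")
async def ping():
    return {"pong": True}


# Style 2: routes declared inside the factory, capturing app-local state.
def make_app() -> FastAPI:
    app = FastAPI()
    app.include_router(router)
    app.state.counter = 0

    @app.get("/count")
    async def count():
        app.state.counter += 1
        return {"count": app.state.counter}

    return app


app = make_app()
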
-@router.post("/score") -async def create_score(request: ScoreRequest, raw_request: Request): - handler = score(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Score API") + generator = await handler.create_tokenize(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, TokenizeResponse): + return JSONResponse(content=generator.model_dump()) - generator = await handler.create_score(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, ScoreResponse): - return JSONResponse(content=generator.model_dump()) + assert_never(generator) - assert_never(generator) + @app.post("/detokenize") + async def detokenize(request: DetokenizeRequest, raw_request: Request): + handler = tokenization(raw_request) -@router.post("/v1/score") -async def create_score_v1(request: ScoreRequest, raw_request: Request): - logger.warning( - "To indicate that Score API is not part of standard OpenAI API, we " - "have moved it to `/score`. Please update your client accordingly.") + generator = await handler.create_detokenize(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, DetokenizeResponse): + return JSONResponse(content=generator.model_dump()) - return await create_score(request, raw_request) + assert_never(generator) -if envs.VLLM_TORCH_PROFILER_DIR: - logger.warning( - "Torch Profiler is enabled in the API server. This should ONLY be " - "used for local development!") + @app.get("/v1/models") + async def show_available_models(raw_request: Request): + handler = base(raw_request) - @router.post("/start_profile") - async def start_profile(raw_request: Request): - logger.info("Starting profiler...") - await engine_client(raw_request).start_profile() - logger.info("Profiler started.") - return Response(status_code=200) + models = await handler.show_available_models() + return JSONResponse(content=models.model_dump()) - @router.post("/stop_profile") - async def stop_profile(raw_request: Request): - logger.info("Stopping profiler...") - await engine_client(raw_request).stop_profile() - logger.info("Profiler stopped.") - return Response(status_code=200) + @app.get("/version") + async def show_version(): + ver = {"version": VLLM_VERSION} + return JSONResponse(content=ver) -if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: - logger.warning( - "Lora dynamic loading & unloading is enabled in the API server. 
" - "This should ONLY be used for local development!") - @router.post("/v1/load_lora_adapter") - async def load_lora_adapter(request: LoadLoraAdapterRequest, - raw_request: Request): - for route in [chat, completion, embedding]: - handler = route(raw_request) - if handler is not None: - response = await handler.load_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + @app.post("/v1/chat/completions") + async def create_chat_completion(request: ChatCompletionRequest, + raw_request: Request): + handler = chat(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Chat Completions API") - return Response(status_code=200, content=response) + generator = await handler.create_chat_completion(request, raw_request) - @router.post("/v1/unload_lora_adapter") - async def unload_lora_adapter(request: UnloadLoraAdapterRequest, - raw_request: Request): - for route in [chat, completion, embedding]: - handler = route(raw_request) - if handler is not None: - response = await handler.unload_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) - return Response(status_code=200, content=response) + elif isinstance(generator, ChatCompletionResponse): + return JSONResponse(content=generator.model_dump()) + return StreamingResponse(content=generator, media_type="text/event-stream") -def build_app(args: Namespace) -> FastAPI: - if args.disable_fastapi_docs: - app = FastAPI(openapi_url=None, - docs_url=None, - redoc_url=None, - lifespan=lifespan) - else: - app = FastAPI(lifespan=lifespan) - app.include_router(router) - app.root_path = args.root_path - app.count = 0 - mount_metrics(app) + @app.post("/v1/completions") + async def create_completion(request: CompletionRequest, raw_request: Request): + raw_request.app.count += 1 + should_profile = raw_request.app.count == 500 + print(f"{should_profile=}") + handler = completion(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Completions API") - app.add_middleware( - CORSMiddleware, - allow_origins=args.allowed_origins, - allow_credentials=args.allow_credentials, - allow_methods=args.allowed_methods, - allow_headers=args.allowed_headers, - ) + generator = await handler.create_completion(request, raw_request, + should_profile=should_profile) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, CompletionResponse): + return JSONResponse(content=generator.model_dump()) - @app.exception_handler(RequestValidationError) - async def validation_exception_handler(_, exc): - err = ErrorResponse(message=str(exc), - type="BadRequestError", - code=HTTPStatus.BAD_REQUEST) - return JSONResponse(err.model_dump(), - status_code=HTTPStatus.BAD_REQUEST) - - if token := envs.VLLM_API_KEY or args.api_key: - - @app.middleware("http") - async def authentication(request: Request, call_next): - if request.method == "OPTIONS": - return await call_next(request) - url_path = request.url.path - if app.root_path and url_path.startswith(app.root_path): - url_path = url_path[len(app.root_path):] - if not url_path.startswith("/v1"): - return await 
call_next(request) - if request.headers.get("Authorization") != "Bearer " + token: - return JSONResponse(content={"error": "Unauthorized"}, - status_code=401) - return await call_next(request) - - @app.middleware("http") - async def add_request_id(request: Request, call_next): - request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex - response = await call_next(request) - response.headers["X-Request-Id"] = request_id - return response - - for middleware in args.middleware: - module_path, object_name = middleware.rsplit(".", 1) - imported = getattr(importlib.import_module(module_path), object_name) - if inspect.isclass(imported): - app.add_middleware(imported) - elif inspect.iscoroutinefunction(imported): - app.middleware("http")(imported) - else: - raise ValueError(f"Invalid middleware {middleware}. " - f"Must be a function or a class.") + return StreamingResponse(content=generator, media_type="text/event-stream") + + + @app.post("/v1/embeddings") + async def create_embedding(request: EmbeddingRequest, raw_request: Request): + handler = embedding(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Embeddings API") + + generator = await handler.create_embedding(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, EmbeddingResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +# @router.post("/score") +# async def create_score(request: ScoreRequest, raw_request: Request): +# handler = score(raw_request) +# if handler is None: +# return base(raw_request).create_error_response( +# message="The model does not support Score API") + +# generator = await handler.create_score(request, raw_request) +# if isinstance(generator, ErrorResponse): +# return JSONResponse(content=generator.model_dump(), +# status_code=generator.code) +# elif isinstance(generator, ScoreResponse): +# return JSONResponse(content=generator.model_dump()) + +# assert_never(generator) + + +# @router.post("/v1/score") +# async def create_score_v1(request: ScoreRequest, raw_request: Request): +# logger.warning( +# "To indicate that Score API is not part of standard OpenAI API, we " +# "have moved it to `/score`. Please update your client accordingly.") + +# return await create_score(request, raw_request) + + +# if envs.VLLM_TORCH_PROFILER_DIR: +# logger.warning( +# "Torch Profiler is enabled in the API server. This should ONLY be " +# "used for local development!") + +# @router.post("/start_profile") +# async def start_profile(raw_request: Request): +# logger.info("Starting profiler...") +# await engine_client(raw_request).start_profile() +# logger.info("Profiler started.") +# return Response(status_code=200) + +# @router.post("/stop_profile") +# async def stop_profile(raw_request: Request): +# logger.info("Stopping profiler...") +# await engine_client(raw_request).stop_profile() +# logger.info("Profiler stopped.") +# return Response(status_code=200) + + +# if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: +# logger.warning( +# "Lora dynamic loading & unloading is enabled in the API server. 
" +# "This should ONLY be used for local development!") + +# @router.post("/v1/load_lora_adapter") +# async def load_lora_adapter(request: LoadLoraAdapterRequest, +# raw_request: Request): +# for route in [chat, completion, embedding]: +# handler = route(raw_request) +# if handler is not None: +# response = await handler.load_lora_adapter(request) +# if isinstance(response, ErrorResponse): +# return JSONResponse(content=response.model_dump(), +# status_code=response.code) + +# return Response(status_code=200, content=response) + +# @router.post("/v1/unload_lora_adapter") +# async def unload_lora_adapter(request: UnloadLoraAdapterRequest, +# raw_request: Request): +# for route in [chat, completion, embedding]: +# handler = route(raw_request) +# if handler is not None: +# response = await handler.unload_lora_adapter(request) +# if isinstance(response, ErrorResponse): +# return JSONResponse(content=response.model_dump(), +# status_code=response.code) + +# return Response(status_code=200, content=response) + + # @app.exception_handler(RequestValidationError) + # async def validation_exception_handler(_, exc): + # err = ErrorResponse(message=str(exc), + # type="BadRequestError", + # code=HTTPStatus.BAD_REQUEST) + # return JSONResponse(err.model_dump(), + # status_code=HTTPStatus.BAD_REQUEST) + + # if token := envs.VLLM_API_KEY or args.api_key: + + # @app.middleware("http") + # async def authentication(request: Request, call_next): + # if request.method == "OPTIONS": + # return await call_next(request) + # url_path = request.url.path + # if app.root_path and url_path.startswith(app.root_path): + # url_path = url_path[len(app.root_path):] + # if not url_path.startswith("/v1"): + # return await call_next(request) + # if request.headers.get("Authorization") != "Bearer " + token: + # return JSONResponse(content={"error": "Unauthorized"}, + # status_code=401) + # return await call_next(request) + + # @app.middleware("http") + # async def add_request_id(request: Request, call_next): + # request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex + # response = await call_next(request) + # response.headers["X-Request-Id"] = request_id + # return response + + # print(f"{args.middleware=}") + # for middleware in args.middleware: + # module_path, object_name = middleware.rsplit(".", 1) + # imported = getattr(importlib.import_module(module_path), object_name) + # if inspect.isclass(imported): + # app.add_middleware(imported) + # elif inspect.iscoroutinefunction(imported): + # app.middleware("http")(imported) + # else: + # raise ValueError(f"Invalid middleware {middleware}. " + # f"Must be a function or a class.") return app diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 46c820fb5a794..a4f9f6b9af536 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -6,6 +6,7 @@ from fastapi import Request +from pyinstrument import Profiler from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger @@ -72,6 +73,7 @@ async def create_completion( - suffix (the language models we currently support do not support suffix) """ + error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret @@ -271,8 +273,9 @@ async def completion_stream_generator( such that the API server falls behind, we dynamically fall back to streaming chunks of tokens. 
""" + print(f"GENERATOR: {should_profile=}") + should_profile=False if should_profile: - from pyinstrument import Profiler print("STARTING PROFILER") profiler = Profiler(async_mode="disabled") profiler.start() @@ -304,7 +307,7 @@ async def completion_stream_generator( if res.finished and should_profile: profiler.stop() - profiler.write_html("task-disabled.html") + profiler.write_html("vllm-proxy.html") # Prompt details are excluded from later streamed outputs if res.prompt_token_ids is not None: From 2c067956128663e087d2cd68cb500f50007ada28 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 18:32:34 +0000 Subject: [PATCH 039/132] Revert "stash profile'" This reverts commit bcd45be052c88e965486f8eeed27537bbc098234. --- vllm/entrypoints/openai/api_server.py | 445 +++++++++--------- vllm/entrypoints/openai/serving_completion.py | 7 +- 2 files changed, 224 insertions(+), 228 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index fe312c3f96cdb..f301ada394000 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -245,7 +245,7 @@ def _cleanup_ipc_path(): multiprocess.mark_process_dead(engine_process.pid) -# router = APIRouter() +router = APIRouter() def mount_metrics(app: FastAPI): @@ -303,254 +303,253 @@ def engine_client(request: Request) -> EngineClient: return request.app.state.engine_client -def build_app(args: Namespace) -> FastAPI: - if args.disable_fastapi_docs: - app = FastAPI(openapi_url=None, - docs_url=None, - redoc_url=None, - lifespan=lifespan) - else: - app = FastAPI(lifespan=lifespan) - # app.include_router(router) - app.root_path = args.root_path - app.count = 0 +@router.get("/health") +async def health(raw_request: Request) -> Response: + """Health check.""" + await engine_client(raw_request).check_health() + return Response(status_code=200) - mount_metrics(app) - app.add_middleware( - CORSMiddleware, - allow_origins=args.allowed_origins, - allow_credentials=args.allow_credentials, - allow_methods=args.allowed_methods, - allow_headers=args.allowed_headers, - ) +@router.post("/tokenize") +async def tokenize(request: TokenizeRequest, raw_request: Request): + handler = tokenization(raw_request) - @app.get("/health") - async def health(raw_request: Request) -> Response: - """Health check.""" - await engine_client(raw_request).check_health() - return Response(status_code=200) + generator = await handler.create_tokenize(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, TokenizeResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +@router.post("/detokenize") +async def detokenize(request: DetokenizeRequest, raw_request: Request): + handler = tokenization(raw_request) + + generator = await handler.create_detokenize(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, DetokenizeResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) - @app.post("/tokenize") - async def tokenize(request: TokenizeRequest, raw_request: Request): - handler = tokenization(raw_request) +@router.get("/v1/models") +async def show_available_models(raw_request: Request): + handler = base(raw_request) - generator = await handler.create_tokenize(request, 
raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, TokenizeResponse): - return JSONResponse(content=generator.model_dump()) + models = await handler.show_available_models() + return JSONResponse(content=models.model_dump()) - assert_never(generator) +@router.get("/version") +async def show_version(): + ver = {"version": VLLM_VERSION} + return JSONResponse(content=ver) - @app.post("/detokenize") - async def detokenize(request: DetokenizeRequest, raw_request: Request): - handler = tokenization(raw_request) - generator = await handler.create_detokenize(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, DetokenizeResponse): - return JSONResponse(content=generator.model_dump()) +@router.post("/v1/chat/completions") +async def create_chat_completion(request: ChatCompletionRequest, + raw_request: Request): + handler = chat(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Chat Completions API") - assert_never(generator) + generator = await handler.create_chat_completion(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) - @app.get("/v1/models") - async def show_available_models(raw_request: Request): - handler = base(raw_request) + elif isinstance(generator, ChatCompletionResponse): + return JSONResponse(content=generator.model_dump()) - models = await handler.show_available_models() - return JSONResponse(content=models.model_dump()) + return StreamingResponse(content=generator, media_type="text/event-stream") - @app.get("/version") - async def show_version(): - ver = {"version": VLLM_VERSION} - return JSONResponse(content=ver) +@router.post("/v1/completions") +async def create_completion(request: CompletionRequest, raw_request: Request): + raw_request.app.count += 1 + should_profile = raw_request.app.count == 500 + handler = completion(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Completions API") + generator = await handler.create_completion(request, raw_request, + should_profile=should_profile) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, CompletionResponse): + return JSONResponse(content=generator.model_dump()) - @app.post("/v1/chat/completions") - async def create_chat_completion(request: ChatCompletionRequest, - raw_request: Request): - handler = chat(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Chat Completions API") + return StreamingResponse(content=generator, media_type="text/event-stream") - generator = await handler.create_chat_completion(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) +@router.post("/v1/embeddings") +async def create_embedding(request: EmbeddingRequest, raw_request: Request): + handler = embedding(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Embeddings API") - elif isinstance(generator, ChatCompletionResponse): - return 
JSONResponse(content=generator.model_dump()) + generator = await handler.create_embedding(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, EmbeddingResponse): + return JSONResponse(content=generator.model_dump()) - return StreamingResponse(content=generator, media_type="text/event-stream") + assert_never(generator) - @app.post("/v1/completions") - async def create_completion(request: CompletionRequest, raw_request: Request): - raw_request.app.count += 1 - should_profile = raw_request.app.count == 500 - print(f"{should_profile=}") - handler = completion(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Completions API") +@router.post("/score") +async def create_score(request: ScoreRequest, raw_request: Request): + handler = score(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Score API") - generator = await handler.create_completion(request, raw_request, - should_profile=should_profile) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, CompletionResponse): - return JSONResponse(content=generator.model_dump()) + generator = await handler.create_score(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, ScoreResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +@router.post("/v1/score") +async def create_score_v1(request: ScoreRequest, raw_request: Request): + logger.warning( + "To indicate that Score API is not part of standard OpenAI API, we " + "have moved it to `/score`. Please update your client accordingly.") + + return await create_score(request, raw_request) + + +if envs.VLLM_TORCH_PROFILER_DIR: + logger.warning( + "Torch Profiler is enabled in the API server. 
This should ONLY be " + "used for local development!") + + @router.post("/start_profile") + async def start_profile(raw_request: Request): + logger.info("Starting profiler...") + await engine_client(raw_request).start_profile() + logger.info("Profiler started.") + return Response(status_code=200) - return StreamingResponse(content=generator, media_type="text/event-stream") - - - @app.post("/v1/embeddings") - async def create_embedding(request: EmbeddingRequest, raw_request: Request): - handler = embedding(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Embeddings API") - - generator = await handler.create_embedding(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - elif isinstance(generator, EmbeddingResponse): - return JSONResponse(content=generator.model_dump()) - - assert_never(generator) - - -# @router.post("/score") -# async def create_score(request: ScoreRequest, raw_request: Request): -# handler = score(raw_request) -# if handler is None: -# return base(raw_request).create_error_response( -# message="The model does not support Score API") - -# generator = await handler.create_score(request, raw_request) -# if isinstance(generator, ErrorResponse): -# return JSONResponse(content=generator.model_dump(), -# status_code=generator.code) -# elif isinstance(generator, ScoreResponse): -# return JSONResponse(content=generator.model_dump()) - -# assert_never(generator) - - -# @router.post("/v1/score") -# async def create_score_v1(request: ScoreRequest, raw_request: Request): -# logger.warning( -# "To indicate that Score API is not part of standard OpenAI API, we " -# "have moved it to `/score`. Please update your client accordingly.") - -# return await create_score(request, raw_request) - - -# if envs.VLLM_TORCH_PROFILER_DIR: -# logger.warning( -# "Torch Profiler is enabled in the API server. This should ONLY be " -# "used for local development!") - -# @router.post("/start_profile") -# async def start_profile(raw_request: Request): -# logger.info("Starting profiler...") -# await engine_client(raw_request).start_profile() -# logger.info("Profiler started.") -# return Response(status_code=200) - -# @router.post("/stop_profile") -# async def stop_profile(raw_request: Request): -# logger.info("Stopping profiler...") -# await engine_client(raw_request).stop_profile() -# logger.info("Profiler stopped.") -# return Response(status_code=200) - - -# if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: -# logger.warning( -# "Lora dynamic loading & unloading is enabled in the API server. 
" -# "This should ONLY be used for local development!") - -# @router.post("/v1/load_lora_adapter") -# async def load_lora_adapter(request: LoadLoraAdapterRequest, -# raw_request: Request): -# for route in [chat, completion, embedding]: -# handler = route(raw_request) -# if handler is not None: -# response = await handler.load_lora_adapter(request) -# if isinstance(response, ErrorResponse): -# return JSONResponse(content=response.model_dump(), -# status_code=response.code) - -# return Response(status_code=200, content=response) - -# @router.post("/v1/unload_lora_adapter") -# async def unload_lora_adapter(request: UnloadLoraAdapterRequest, -# raw_request: Request): -# for route in [chat, completion, embedding]: -# handler = route(raw_request) -# if handler is not None: -# response = await handler.unload_lora_adapter(request) -# if isinstance(response, ErrorResponse): -# return JSONResponse(content=response.model_dump(), -# status_code=response.code) - -# return Response(status_code=200, content=response) - - # @app.exception_handler(RequestValidationError) - # async def validation_exception_handler(_, exc): - # err = ErrorResponse(message=str(exc), - # type="BadRequestError", - # code=HTTPStatus.BAD_REQUEST) - # return JSONResponse(err.model_dump(), - # status_code=HTTPStatus.BAD_REQUEST) - - # if token := envs.VLLM_API_KEY or args.api_key: - - # @app.middleware("http") - # async def authentication(request: Request, call_next): - # if request.method == "OPTIONS": - # return await call_next(request) - # url_path = request.url.path - # if app.root_path and url_path.startswith(app.root_path): - # url_path = url_path[len(app.root_path):] - # if not url_path.startswith("/v1"): - # return await call_next(request) - # if request.headers.get("Authorization") != "Bearer " + token: - # return JSONResponse(content={"error": "Unauthorized"}, - # status_code=401) - # return await call_next(request) - - # @app.middleware("http") - # async def add_request_id(request: Request, call_next): - # request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex - # response = await call_next(request) - # response.headers["X-Request-Id"] = request_id - # return response - - # print(f"{args.middleware=}") - # for middleware in args.middleware: - # module_path, object_name = middleware.rsplit(".", 1) - # imported = getattr(importlib.import_module(module_path), object_name) - # if inspect.isclass(imported): - # app.add_middleware(imported) - # elif inspect.iscoroutinefunction(imported): - # app.middleware("http")(imported) - # else: - # raise ValueError(f"Invalid middleware {middleware}. " - # f"Must be a function or a class.") + @router.post("/stop_profile") + async def stop_profile(raw_request: Request): + logger.info("Stopping profiler...") + await engine_client(raw_request).stop_profile() + logger.info("Profiler stopped.") + return Response(status_code=200) + + +if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: + logger.warning( + "Lora dynamic loading & unloading is enabled in the API server. 
" + "This should ONLY be used for local development!") + + @router.post("/v1/load_lora_adapter") + async def load_lora_adapter(request: LoadLoraAdapterRequest, + raw_request: Request): + for route in [chat, completion, embedding]: + handler = route(raw_request) + if handler is not None: + response = await handler.load_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) + + return Response(status_code=200, content=response) + + @router.post("/v1/unload_lora_adapter") + async def unload_lora_adapter(request: UnloadLoraAdapterRequest, + raw_request: Request): + for route in [chat, completion, embedding]: + handler = route(raw_request) + if handler is not None: + response = await handler.unload_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) + + return Response(status_code=200, content=response) + + +def build_app(args: Namespace) -> FastAPI: + if args.disable_fastapi_docs: + app = FastAPI(openapi_url=None, + docs_url=None, + redoc_url=None, + lifespan=lifespan) + else: + app = FastAPI(lifespan=lifespan) + app.include_router(router) + app.root_path = args.root_path + app.count = 0 + + mount_metrics(app) + + app.add_middleware( + CORSMiddleware, + allow_origins=args.allowed_origins, + allow_credentials=args.allow_credentials, + allow_methods=args.allowed_methods, + allow_headers=args.allowed_headers, + ) + + @app.exception_handler(RequestValidationError) + async def validation_exception_handler(_, exc): + err = ErrorResponse(message=str(exc), + type="BadRequestError", + code=HTTPStatus.BAD_REQUEST) + return JSONResponse(err.model_dump(), + status_code=HTTPStatus.BAD_REQUEST) + + if token := envs.VLLM_API_KEY or args.api_key: + + @app.middleware("http") + async def authentication(request: Request, call_next): + if request.method == "OPTIONS": + return await call_next(request) + url_path = request.url.path + if app.root_path and url_path.startswith(app.root_path): + url_path = url_path[len(app.root_path):] + if not url_path.startswith("/v1"): + return await call_next(request) + if request.headers.get("Authorization") != "Bearer " + token: + return JSONResponse(content={"error": "Unauthorized"}, + status_code=401) + return await call_next(request) + + @app.middleware("http") + async def add_request_id(request: Request, call_next): + request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex + response = await call_next(request) + response.headers["X-Request-Id"] = request_id + return response + + for middleware in args.middleware: + module_path, object_name = middleware.rsplit(".", 1) + imported = getattr(importlib.import_module(module_path), object_name) + if inspect.isclass(imported): + app.add_middleware(imported) + elif inspect.iscoroutinefunction(imported): + app.middleware("http")(imported) + else: + raise ValueError(f"Invalid middleware {middleware}. 
" + f"Must be a function or a class.") return app diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index a4f9f6b9af536..46c820fb5a794 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -6,7 +6,6 @@ from fastapi import Request -from pyinstrument import Profiler from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger @@ -73,7 +72,6 @@ async def create_completion( - suffix (the language models we currently support do not support suffix) """ - error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret @@ -273,9 +271,8 @@ async def completion_stream_generator( such that the API server falls behind, we dynamically fall back to streaming chunks of tokens. """ - print(f"GENERATOR: {should_profile=}") - should_profile=False if should_profile: + from pyinstrument import Profiler print("STARTING PROFILER") profiler = Profiler(async_mode="disabled") profiler.start() @@ -307,7 +304,7 @@ async def completion_stream_generator( if res.finished and should_profile: profiler.stop() - profiler.write_html("vllm-proxy.html") + profiler.write_html("task-disabled.html") # Prompt details are excluded from later streamed outputs if res.prompt_token_ids is not None: From 4571da6bb9ddaa7c536ba99d492e101524183ae6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 21:34:14 +0000 Subject: [PATCH 040/132] updated --- vllm/entrypoints/openai/api_server.py | 17 ++-- vllm/entrypoints/openai/protocol.py | 9 +- vllm/entrypoints/openai/serving_completion.py | 12 --- vllm/v1/engine/async_llm.py | 89 ++++--------------- vllm/v1/engine/core.py | 4 +- vllm/v1/engine/detokenizer.py | 6 +- 6 files changed, 33 insertions(+), 104 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f301ada394000..c59c075e0ae2e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -374,15 +374,12 @@ async def create_chat_completion(request: ChatCompletionRequest, @router.post("/v1/completions") async def create_completion(request: CompletionRequest, raw_request: Request): - raw_request.app.count += 1 - should_profile = raw_request.app.count == 500 handler = completion(raw_request) if handler is None: return base(raw_request).create_error_response( message="The model does not support Completions API") - generator = await handler.create_completion(request, raw_request, - should_profile=should_profile) + generator = await handler.create_completion(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -533,12 +530,12 @@ async def authentication(request: Request, call_next): status_code=401) return await call_next(request) - @app.middleware("http") - async def add_request_id(request: Request, call_next): - request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex - response = await call_next(request) - response.headers["X-Request-Id"] = request_id - return response + # @app.middleware("http") + # async def add_request_id(request: Request, call_next): + # request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex + # response = await call_next(request) + # response.headers["X-Request-Id"] = request_id + # return response for middleware in args.middleware: module_path, object_name = 
middleware.rsplit(".", 1) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index cfc02013dd8c5..8bae4dadbe625 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -38,11 +38,6 @@ assert _LONG_INFO.min == _MOCK_LONG_INFO.min assert _LONG_INFO.max == _MOCK_LONG_INFO.max -STREAM_SAMPLING_OUTPUT_KIND = RequestOutputKind.DELTA -if VLLM_USE_V1: - STREAM_SAMPLING_OUTPUT_KIND = RequestOutputKind.CUMULATIVE - - class OpenAIBaseModel(BaseModel): # OpenAI API does allow extra fields model_config = ConfigDict(extra="allow") @@ -427,7 +422,7 @@ def to_sampling_params( logits_processor_pattern), include_stop_str_in_output=self.include_stop_str_in_output, truncate_prompt_tokens=self.truncate_prompt_tokens, - output_kind=STREAM_SAMPLING_OUTPUT_KIND if self.stream \ + output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, guided_decoding=guided_decoding, logit_bias=self.logit_bias) @@ -742,7 +737,7 @@ def to_sampling_params( logits_processors=get_logits_processors(self.logits_processors, logits_processor_pattern), truncate_prompt_tokens=self.truncate_prompt_tokens, - output_kind=STREAM_SAMPLING_OUTPUT_KIND if self.stream \ + output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, guided_decoding=guided_decoding, logit_bias=self.logit_bias, diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 46c820fb5a794..19c6f59cdd2aa 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -61,7 +61,6 @@ async def create_completion( self, request: CompletionRequest, raw_request: Request, - should_profile: bool=False ) -> Union[AsyncGenerator[str, None], CompletionResponse, ErrorResponse]: """Completion API similar to OpenAI's API. @@ -189,7 +188,6 @@ async def create_completion( tokenizer=tokenizer, request_metadata=request_metadata, output_kind=sampling_params.output_kind, - should_profile=should_profile, ) # Non-streaming response @@ -249,7 +247,6 @@ async def completion_stream_generator( tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, output_kind: RequestOutputKind, - should_profile: bool = False, ) -> AsyncGenerator[str, None]: """ In V0, we use RequestOutputType.DELTA and each RequestOutput @@ -271,11 +268,6 @@ async def completion_stream_generator( such that the API server falls behind, we dynamically fall back to streaming chunks of tokens. 
""" - if should_profile: - from pyinstrument import Profiler - print("STARTING PROFILER") - profiler = Profiler(async_mode="disabled") - profiler.start() assert (output_kind == RequestOutputKind.CUMULATIVE or output_kind == RequestOutputKind.DELTA) @@ -302,10 +294,6 @@ async def completion_stream_generator( prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt - if res.finished and should_profile: - profiler.stop() - profiler.write_html("task-disabled.html") - # Prompt details are excluded from later streamed outputs if res.prompt_token_ids is not None: num_prompt_tokens[prompt_idx] = len(res.prompt_token_ids) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 09ee89e645a44..d53649fb0ffb2 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -28,29 +28,6 @@ import uvloop asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) -@dataclass -class RequestState: - """ - RequestState manages concurrency between: - * the output_handler(), which pulls outputs from EngineCore - * the per-request generate(), which yields to the API server - - The output_handler adds new RequestOutputs to out_list and sets the - asyncio event, notifying the generate() that there is work to do. - - generate() waits on the asyncio event and yields the data from - out_list back to the caller generate() - """ - - prompt: str - prompt_token_ids: List[int] - event: asyncio.Event - out_list: List[RequestOutput] - - @classmethod - def new(cls, prompt, prompt_token_ids) -> "RequestState": - return cls(prompt, prompt_token_ids, asyncio.Event(), []) - class AsyncLLM(EngineClient): @@ -82,8 +59,8 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() - # RequestId -> RequestState. - self.rid_to_state: Dict[str, RequestState] = {} + # RequestId -> OutputQueue. + self.rid_to_queue: Dict[str, asyncio.Queue[RequestOutput]] = {} # List of cancelled request ids to be aborted. self.client_aborted_requests: List[str] = [] @@ -186,7 +163,7 @@ async def add_request( trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> RequestState: + ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. @@ -196,8 +173,7 @@ async def add_request( # 1) Add to RequestState tracker. The "event" is used to manage # concurrency between generate() and output_handler(). - self.rid_to_state[request_id] = RequestState.new(prompt, - engine_core_req.prompt_token_ids) + self.rid_to_queue[request_id] = asyncio.Queue() # 3) Add the DetokenizerRequest to Detokenizer. # TODO: sending these separately is a race condition. We should instead @@ -207,7 +183,7 @@ async def add_request( # 4) Add the EngineCoreRequest to EngineCore. await self.engine_core.add_request_async(engine_core_req) - return self.rid_to_state[request_id] + return self.rid_to_queue[request_id] # TODO: we should support multiple prompts in one call, as you # can do with LLM.generate. So that for multi-prompt completion @@ -238,10 +214,6 @@ async def generate( the latest RequestOutput back to the caller. """ - # DELTA streaming is not supported due to dynamic chunking. 
- assert (sampling_params.output_kind == RequestOutputKind.CUMULATIVE or - sampling_params.output_kind == RequestOutputKind.FINAL_ONLY) - # We start the output_handler on the first call to generate() so that # we can call __init__ before the event loop starts, which enables us # to handle startup failure gracefully in the OpenAI server. @@ -259,7 +231,7 @@ def signal_handler(self, signum=None, frame=None): loop.create_task(self._run_output_handler()) loop.add_signal_handler(signal.SIGTERM, signal_handler) - state = await self.add_request( + queue = await self.add_request( request_id, prompt, sampling_params, @@ -271,33 +243,22 @@ def signal_handler(self, signum=None, frame=None): while True: try: - await asyncio.wait_for(state.event.wait(), timeout=4) - - # NOTE(rob): out_list can have more than one item. However, - # in the streaming case, we use RequestOutputKind.CUMULATIVE, - # which has the full generated text output (not just the text - # corresponding to the last token). So, we can just send the - # last RequestOutput and the API Client handles converting into - # a delta text. This way we do "dynamic chunked streaming", such - # that the API client does not fall behind the EngineCor, - # which happens at high QPS otherwise. - out = state.out_list[-1] - if len(state.out_list) > 1: - logger.info(f"{len(state.out_list)=}") + out = await asyncio.wait_for(queue.get(), timeout=4) + if out.finished: + del self.rid_to_queue[request_id] + yield out + break + + yield out except asyncio.TimeoutError: # TODO(rob): do request cancellation checking here. # logger.debug("Timeout waiting for %s", request_id) continue - - state.out_list = [] - if out.finished: - del self.rid_to_state[request_id] - yield out - break - state.event.clear() - yield out + + + # async def _process_cancellations(self) -> None: # """ @@ -331,30 +292,18 @@ def signal_handler(self, signum=None, frame=None): async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" - epoch = 0 while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 - # self.warned = False - # if self.epoch % 10 == 0: - # logger.info(f"\n{self.epoch=}\n") # 1) Pull outputs from the Detokenizer. - outputs = await self.detokenizer.output_socket.recv_pyobj() + outputs: List[RequestOutput] = await self.detokenizer.output_socket.recv_pyobj() for out in outputs: - if out.request_id not in self.rid_to_state: + if out.request_id not in self.rid_to_queue: raise RuntimeError(f"{out.request_id} " "not in RequestStates") - state = self.rid_to_state[out.request_id] - - # Update the RequestState and alert generate() that there - # is a RequestOutput ready to return to the user. - - state.out_list.append(out) - state.event.set() + self.rid_to_queue[out.request_id].put_nowait(out) # 3) Abort any requests that finished due to stop strings. # await self.engine_core.abort_requests_async(reqs_to_abort) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f8aef10908514..a3f294bffa064 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -247,8 +247,8 @@ def run_busy_loop(self): # Loop until process is sent a SIGINT or SIGTERM epoch = 0 while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 + # logger.info(f"EPOCH: {epoch}") + # epoch += 1 # 1) Poll the input queue until there is work to do. 
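
# For context on the generate()/output-handler changes above: a minimal,
# self-contained sketch of the per-request asyncio.Queue hand-off. The
# producer (the output handler) calls put_nowait(); the consumer awaits
# queue.get() with a timeout so it can periodically check for cancellation.
# `FakeOutput` is an illustrative stand-in for RequestOutput.
import asyncio
from dataclasses import dataclass


@dataclass
class FakeOutput:
    text: str
    finished: bool = False


async def consume(queue: "asyncio.Queue[FakeOutput]") -> list:
    texts = []
    while True:
        try:
            out = await asyncio.wait_for(queue.get(), timeout=4)
        except asyncio.TimeoutError:
            # Hook for client-cancellation checks, as in generate() above.
            continue
        texts.append(out.text)
        if out.finished:
            return texts


async def main() -> None:
    queue: "asyncio.Queue[FakeOutput]" = asyncio.Queue()
    queue.put_nowait(FakeOutput("Hello"))
    queue.put_nowait(FakeOutput(" world", finished=True))
    print(await consume(queue))  # ['Hello', ' world']


asyncio.run(main())
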
if not self.scheduler.has_unfinished_requests(): diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 8db857bca9b72..ac6ec1ce44deb 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -128,7 +128,7 @@ def from_eco( read_offset=read_offset, skip_special_tokens=True, spaces_between_special_tokens=True, - output_kind=RequestOutputKind.CUMULATIVE, + output_kind=RequestOutputKind.DELTA, request_id=eco.request_id, prompt=eco.prompt, prompt_token_ids=eco.prompt_token_ids, @@ -458,8 +458,8 @@ def run_busy_loop(self): epoch = 0 while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 + # logger.info(f"EPOCH: {epoch}") + # epoch += 1 socks = dict(poller.poll()) From c5dacd4a2b81f4e6e7e7de184d5bf4100bd642e3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 21:35:51 +0000 Subject: [PATCH 041/132] remove output kind from api server --- vllm/entrypoints/openai/serving_completion.py | 48 ++----------------- 1 file changed, 5 insertions(+), 43 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 19c6f59cdd2aa..e9a5a55079ca4 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -187,7 +187,6 @@ async def create_completion( num_prompts=num_prompts, tokenizer=tokenizer, request_metadata=request_metadata, - output_kind=sampling_params.output_kind, ) # Non-streaming response @@ -246,32 +245,7 @@ async def completion_stream_generator( num_prompts: int, tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, - output_kind: RequestOutputKind, ) -> AsyncGenerator[str, None]: - """ - In V0, we use RequestOutputType.DELTA and each RequestOutput - from the result_generator is guaranteed to correspond to - a single token so can construct the outputs without needing - to maintain any state. - - In V1, we use RequestOutputType.CUMULATIVE and each RequestOutput - from the result_generator is not guaranteed to correspond to - a single token (it could correspond to 2+ tokens). - - To handle this, we need to maintain state around how many - characters and tokens have been returned so far, and dynamically - stream back just the delta (where the delta could be the text - corresponding to N tokens). - - We do this to dynamically adjust how much work the API server - is doing. If the QPS is high and streaming becomes a bottleneck, - such that the API server falls behind, we dynamically fall back - to streaming chunks of tokens. 
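
# For context on the DELTA vs CUMULATIVE streaming discussion in the
# surrounding hunks: a minimal sketch of the two bookkeeping modes. With
# CUMULATIVE outputs each chunk carries the full text generated so far and the
# server slices off what was already sent; with DELTA outputs each chunk is
# already the increment and only lengths are accumulated. Illustrative only,
# not the vLLM implementation.
from typing import List


def deltas_from_cumulative(chunks: List[str]) -> List[str]:
    deltas, sent_len = [], 0
    for text in chunks:
        deltas.append(text[sent_len:])
        sent_len = len(text)
    return deltas


def chars_sent_from_deltas(chunks: List[str]) -> int:
    return sum(len(text) for text in chunks)


assert deltas_from_cumulative(["He", "Hello", "Hello!"]) == ["He", "llo", "!"]
assert chars_sent_from_deltas(["He", "llo", "!"]) == 6
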
- """ - - assert (output_kind == RequestOutputKind.CUMULATIVE - or output_kind == RequestOutputKind.DELTA) - num_choices = 1 if request.n is None else request.n previous_text_lens = [0] * num_choices * num_prompts previous_num_tokens = [0] * num_choices * num_prompts @@ -327,17 +301,9 @@ async def completion_stream_generator( ] has_echoed[i] = True else: - if output_kind == RequestOutputKind.CUMULATIVE: - delta_text = output.text[previous_text_lens[i]:] - delta_token_ids = output.token_ids[ - previous_num_tokens[i]:] - out_logprobs = ( - output.logprobs[previous_num_tokens[i]:] - if output.logprobs else None) - else: - delta_text = output.text - delta_token_ids = output.token_ids - out_logprobs = output.logprobs + delta_text = output.text + delta_token_ids = output.token_ids + out_logprobs = output.logprobs if not delta_text and not delta_token_ids \ and not previous_num_tokens[i]: @@ -357,12 +323,8 @@ async def completion_stream_generator( else: logprobs = None - if output_kind == RequestOutputKind.CUMULATIVE: - previous_text_lens[i] = len(output.text) - previous_num_tokens[i] = len(output.token_ids) - else: - previous_text_lens[i] += len(output.text) - previous_num_tokens[i] += len(output.token_ids) + previous_text_lens[i] += len(output.text) + previous_num_tokens[i] += len(output.token_ids) finish_reason = output.finish_reason stop_reason = output.stop_reason From 23d3e60ceacd0c06e41d4781a6424bd8290a780e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 21:58:16 +0000 Subject: [PATCH 042/132] updated --- benchmarks/backend_request_func.py | 4 +--- benchmarks/benchmark_throughput.py | 19 ++++++++----------- vllm/v1/engine/async_llm.py | 6 +++++- vllm/v1/engine/core.py | 4 ++-- vllm/v1/engine/detokenizer.py | 5 ++--- 5 files changed, 18 insertions(+), 20 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 1374768dc3def..b67849038cf0d 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -242,9 +242,7 @@ async def async_request_openai_completions( "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, - # "ignore_eos": request_func_input.ignore_eos, - "ignore_eos": False, - + "ignore_eos": request_func_input.ignore_eos, } if request_func_input.extra_body: payload.update(request_func_input.extra_body) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 12ee9798d827e..69e82099e4506 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -198,27 +198,21 @@ async def run_vllm_async( max_tokens=request.expected_output_len, )) + async def run(generator): + async for res in generator: + pass + tasks = [] start = time.perf_counter() for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): generator = llm.generate(prompt, sp, request_id=f"test{i}") tasks.append(run(generator)) - # all_gens = merge_async_iterators(*generators) - # async for i, res in all_gens: - # pass - from aiodebug import log_slow_callbacks - loop = asyncio.get_event_loop() - log_slow_callbacks.enable(0.05) await asyncio.gather(*tasks) end = time.perf_counter() return end - start -async def run(generator): - async for res in generator: - pass - def run_hf( requests: List[SampleRequest], model: str, @@ -371,7 +365,10 @@ def main(args: argparse.Namespace): # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. 
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " - f"{total_output_tokens / elapsed_time:.2f} output tokens/s") + f"{total_output_tokens / elapsed_time:.2f} output tokens/s, " + f"{(total_num_tokens - total_output_tokens) / len(requests)} input tokens/req, " + f"{(total_output_tokens) / len(requests)} output tokens/req, " + ) # Output JSON results if specified if args.output_json: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d53649fb0ffb2..f998aab95e4dd 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -92,7 +92,6 @@ def __init__( # self.output_handler: Optional[asyncio.Task] = None self.to_create_loop = True - self.epoch = 0 def __del__(self): self.shutdown() @@ -244,6 +243,8 @@ def signal_handler(self, signum=None, frame=None): while True: try: out = await asyncio.wait_for(queue.get(), timeout=4) + + logger.info(f"{queue.qsize()=}") if out.finished: del self.rid_to_queue[request_id] yield out @@ -293,7 +294,10 @@ def signal_handler(self, signum=None, frame=None): async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" + epoch = 0 while True: + logger.info(f"EPOCH: {epoch}") + epoch+=1 # 1) Pull outputs from the Detokenizer. outputs: List[RequestOutput] = await self.detokenizer.output_socket.recv_pyobj() diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index a3f294bffa064..f8aef10908514 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -247,8 +247,8 @@ def run_busy_loop(self): # Loop until process is sent a SIGINT or SIGTERM epoch = 0 while True: - # logger.info(f"EPOCH: {epoch}") - # epoch += 1 + logger.info(f"EPOCH: {epoch}") + epoch += 1 # 1) Poll the input queue until there is work to do. 
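
# For context on the throughput summary printed above: the same per-request
# accounting as a small helper. Purely illustrative; the argument names are
# assumptions, not benchmark_throughput.py's API.
def summarize(elapsed_s: float, num_requests: int,
              total_tokens: int, output_tokens: int) -> dict:
    input_tokens = total_tokens - output_tokens
    return {
        "requests_per_s": num_requests / elapsed_s,
        "total_tokens_per_s": total_tokens / elapsed_s,
        "output_tokens_per_s": output_tokens / elapsed_s,
        "input_tokens_per_req": input_tokens / num_requests,
        "output_tokens_per_req": output_tokens / num_requests,
    }


print(summarize(elapsed_s=10.0, num_requests=100,
                total_tokens=50_000, output_tokens=20_000))
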
if not self.scheduler.has_unfinished_requests(): diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index ac6ec1ce44deb..92fac83c4455a 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -231,7 +231,6 @@ def _get_next_output_text(self, finished: bool, delta: bool) -> str: return self.output_text[last_offset:length] return "" -import time class Detokenizer: def __init__(self, @@ -458,8 +457,8 @@ def run_busy_loop(self): epoch = 0 while True: - # logger.info(f"EPOCH: {epoch}") - # epoch += 1 + logger.info(f"EPOCH: {epoch}") + epoch += 1 socks = dict(poller.poll()) From 3acf5c261f9b2682f6f6dbcd7ffd340a02e4a4ac Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 21:59:39 +0000 Subject: [PATCH 043/132] cleanup --- benchmarks/benchmark_throughput.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 69e82099e4506..0695a8579c49c 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -6,6 +6,7 @@ import time from typing import List, Optional +import asyncio import torch import uvloop from PIL import Image @@ -170,7 +171,6 @@ def run_vllm( end = time.perf_counter() return end - start -import asyncio async def run_vllm_async( requests: List[SampleRequest], n: int, @@ -334,8 +334,7 @@ def main(args: argparse.Namespace): for request in requests) if args.backend == "vllm": if args.async_engine: - asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) - elapsed_time = asyncio.run( + elapsed_time = uvloop.run( run_vllm_async( requests, args.n, From 84ff3c2e1cfe5d8709bf50bfd231c2b339fa89c2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:00:55 +0000 Subject: [PATCH 044/132] cleanup --- benchmarks/benchmark_throughput.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 0695a8579c49c..dac6dcc959a94 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -171,6 +171,7 @@ def run_vllm( end = time.perf_counter() return end - start + async def run_vllm_async( requests: List[SampleRequest], n: int, @@ -198,21 +199,19 @@ async def run_vllm_async( max_tokens=request.expected_output_len, )) - async def run(generator): - async for res in generator: - pass - - tasks = [] + generators = [] start = time.perf_counter() for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): generator = llm.generate(prompt, sp, request_id=f"test{i}") - tasks.append(run(generator)) - - await asyncio.gather(*tasks) - + generators.append(generator) + + all_gens = merge_async_iterators(*generators) + async for i, res in all_gens: + pass end = time.perf_counter() return end - start + def run_hf( requests: List[SampleRequest], model: str, From ddf14264b15c67784aabbfa9c7bf723bbc5ea098 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:02:02 +0000 Subject: [PATCH 045/132] updated --- benchmarks/benchmark_throughput.py | 2 -- vllm/entrypoints/openai/api_server.py | 1 - vllm/entrypoints/openai/protocol.py | 1 - 3 files changed, 4 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index dac6dcc959a94..16ca509c12d18 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -6,7 +6,6 @@ import time from typing import List, Optional -import 
asyncio import torch import uvloop from PIL import Image @@ -204,7 +203,6 @@ async def run_vllm_async( for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): generator = llm.generate(prompt, sp, request_id=f"test{i}") generators.append(generator) - all_gens = merge_async_iterators(*generators) async for i, res in all_gens: pass diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index c59c075e0ae2e..090c610d3a008 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -494,7 +494,6 @@ def build_app(args: Namespace) -> FastAPI: app = FastAPI(lifespan=lifespan) app.include_router(router) app.root_path = args.root_path - app.count = 0 mount_metrics(app) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 8bae4dadbe625..ff7bbb8da80cc 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -10,7 +10,6 @@ from typing_extensions import Annotated from vllm.entrypoints.chat_utils import ChatCompletionMessageParam -from vllm.envs import VLLM_USE_V1 from vllm.logger import init_logger from vllm.pooling_params import PoolingParams from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, From 895fd0d38cf446365b17455b3e06449c2462587c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:02:50 +0000 Subject: [PATCH 046/132] updated --- vllm/v1/engine/async_llm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index f998aab95e4dd..10a19007be117 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -244,7 +244,9 @@ def signal_handler(self, signum=None, frame=None): try: out = await asyncio.wait_for(queue.get(), timeout=4) - logger.info(f"{queue.qsize()=}") + q_size = queue.qsize() + if len(q_size) > 0: + logger.info(f"{q_size}") if out.finished: del self.rid_to_queue[request_id] yield out From 1184615cc632a20cff31b5d431ec2c756074627b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:03:26 +0000 Subject: [PATCH 047/132] cleanup --- vllm/entrypoints/openai/protocol.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index ff7bbb8da80cc..6ed7c2e9dcd6b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -37,6 +37,7 @@ assert _LONG_INFO.min == _MOCK_LONG_INFO.min assert _LONG_INFO.max == _MOCK_LONG_INFO.max + class OpenAIBaseModel(BaseModel): # OpenAI API does allow extra fields model_config = ConfigDict(extra="allow") From 7da9b1ad83ee9a8e34b0ef9f9b50aa72eccfaf73 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:04:01 +0000 Subject: [PATCH 048/132] cleanup --- vllm/entrypoints/openai/serving_completion.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index e9a5a55079ca4..607c30b55ea45 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -27,8 +27,7 @@ PromptAdapterPath) from vllm.logger import init_logger from vllm.outputs import RequestOutput -from vllm.sampling_params import (BeamSearchParams, RequestOutputKind, - SamplingParams) +from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob from 
vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import merge_async_iterators @@ -174,7 +173,6 @@ async def create_completion( stream = (request.stream and (request.best_of is None or request.n == request.best_of) and not request.use_beam_search) - assert isinstance(sampling_params, SamplingParams) # Streaming response if stream: @@ -186,8 +184,7 @@ async def create_completion( model_name, num_prompts=num_prompts, tokenizer=tokenizer, - request_metadata=request_metadata, - ) + request_metadata=request_metadata) # Non-streaming response final_res_batch: List[Optional[RequestOutput]] = [None] * num_prompts From f61c26a167c4021325d3400eddfbb6804f97beef Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:04:43 +0000 Subject: [PATCH 049/132] cleanup --- vllm/entrypoints/openai/serving_completion.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 607c30b55ea45..72b98dab345a3 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -319,7 +319,6 @@ async def completion_stream_generator( ) else: logprobs = None - previous_text_lens[i] += len(output.text) previous_num_tokens[i] += len(output.token_ids) @@ -379,7 +378,6 @@ async def completion_stream_generator( yield f"data: {data}\n\n" yield "data: [DONE]\n\n" - def request_output_to_completion_response( self, final_res_batch: List[RequestOutput], From 4e3de90c40b0f1799db4333dd911ed2138984e31 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:05:57 +0000 Subject: [PATCH 050/132] updated --- vllm/v1/engine/__init__.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index f81869a46b837..1482505e88490 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -35,31 +35,6 @@ class DetokenizerRequest( include_stop_str_in_output: bool -class DetokenizerOutput( - msgspec.Struct, - array_like=True, # type: ignore[call-arg] - omit_defaults=True, # type: ignore[call-arg] - gc=False): # type: ignore[call-arg] - - request_id: str - text: str - token_ids: List[int] - finished: bool - - -class DetokenizerOutputs( - msgspec.Struct, - array_like=True, # type: ignore[call-arg] - omit_defaults=True, # type: ignore[call-arg] - gc=False): # type: ignore[call-arg] - - #NOTE(Nick): We could consider ways to make this more compact, - # e.g. 
columnwise layout and using an int enum for finish/stop reason - - # [num_reqs] - outputs: List[DetokenizerOutput] - - @dataclass class EngineCoreRequest: From 07e4fa2d2b6d05a32f2257bc8c2b22ee946dfdf7 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:06:52 +0000 Subject: [PATCH 051/132] updated --- vllm/v1/engine/core.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f8aef10908514..ebfa9b14d90e7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -338,13 +338,8 @@ def process_output_socket(self, output_path: str): buffer = bytearray() with zmq_socket_ctx(output_path, zmq.PUSH) as socket: - idx = 0 while True: engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) encoder.encode_into(outputs, buffer) - # msg = (DetokenizerRequestType.OUT.value, buffer) - msg = (buffer, ) - # logger.info(f"SEND: {idx}: {len(engine_core_outputs)}") - # idx += 1 - socket.send_multipart(msg, copy=False) + socket.send_multipart((buffer,), copy=False) From 2022a4f4f8a55485dc73cb3fd50daac41eb3cc64 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:09:46 +0000 Subject: [PATCH 052/132] format --- vllm/outputs.py | 24 ------------------------ vllm/v1/engine/__init__.py | 1 - vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/detokenizer.py | 6 ++---- 4 files changed, 3 insertions(+), 30 deletions(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index 9a4b4353deb1d..b2f869b862a6a 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -12,7 +12,6 @@ from vllm.sampling_params import RequestOutputKind from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs, SequenceGroup, SequenceGroupBase, SequenceStatus) -from vllm.v1.engine import DetokenizerOutput @dataclass @@ -133,29 +132,6 @@ def __init__( self.encoder_prompt_token_ids = encoder_prompt_token_ids self.num_cached_tokens = num_cached_tokens - @classmethod - def from_detok( - cls, - prompt: str, - prompt_token_ids: List[int], - detok_output: DetokenizerOutput, - ): - completion_output = CompletionOutput( - index=0, - text=detok_output.text, - token_ids=detok_output.token_ids, - cumulative_logprob=None, - logprobs=None, # TODO - ) - - return RequestOutput( - request_id=detok_output.request_id, - prompt=prompt, - prompt_token_ids=prompt_token_ids, - prompt_logprobs=None, # TODO - outputs=[completion_output], - finished=detok_output.finished, - ) @classmethod def new( diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 1482505e88490..367aee130cd75 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -69,7 +69,6 @@ class EngineCoreOutput( finished: bool finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None - class EngineCoreOutputs( diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 10a19007be117..003db30adbc12 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -245,7 +245,7 @@ def signal_handler(self, signum=None, frame=None): out = await asyncio.wait_for(queue.get(), timeout=4) q_size = queue.qsize() - if len(q_size) > 0: + if q_size > 0: logger.info(f"{q_size}") if out.finished: del self.rid_to_queue[request_id] diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 92fac83c4455a..db65d30f443b1 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -14,10 +14,8 @@ AnyTokenizer, 
convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import get_open_zmq_ipc_path, kill_process_tree -from vllm.v1.engine import (DetokenizerRequest, DetokenizerOutputs, - DetokenizerOutput, - EngineCoreOutput, EngineCoreOutputs, - BackgroundProcHandle,) +from vllm.v1.engine import (DetokenizerRequest, EngineCoreOutput, + EngineCoreOutputs, BackgroundProcHandle,) from vllm.v1.utils import (make_zmq_socket, zmq_socket_ctx, wait_for_startup) from vllm.v1.serial_utils import PickleEncoder From 10c7092c637cac12e87ef1505227545e5d7a5c7c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:10:10 +0000 Subject: [PATCH 053/132] updated --- vllm/v1/engine/detokenizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index db65d30f443b1..89757501fe919 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -287,12 +287,11 @@ def add_request_eco( def step( self, encore_core_outputs: List[EngineCoreOutput] - ) -> DetokenizerOutputs: + ) -> List[RequestOutput]: """Update state and request the RequestOutputs to the LLMEngine.""" request_outputs: List[RequestOutput] = [] # requests_to_abort: List[str] = [] - # detokenizer_outputs = DetokenizerOutputs(outputs=[]) for engine_core_output in encore_core_outputs: request_id = engine_core_output.request_id From a0620ac457e8157d70c8de11834d4f465ea3464d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:10:39 +0000 Subject: [PATCH 054/132] cleanup --- vllm/v1/engine/detokenizer.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 89757501fe919..1a19d99b7d6bc 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -485,8 +485,6 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): # Serialization setup. self.encoder = msgspec.msgpack.Encoder() - # self.decoder = PickleEncoder() - self.decoder = msgspec.msgpack.Decoder(DetokenizerOutputs) # ZMQ setup. 
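The serialization cleanup in these detokenizer hunks keeps the msgspec/msgpack path on the client side. For reference, a self-contained round trip of that pattern; the `Output` struct below is illustrative and not the real `EngineCoreOutput`:

    # Typed msgspec Structs encoded to msgpack bytes on one side of a socket
    # and decoded back into the same type on the other side.
    from typing import List, Optional

    import msgspec

    class Output(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
        request_id: str
        new_token_ids: List[int]
        finished: bool
        finish_reason: Optional[str] = None

    encoder = msgspec.msgpack.Encoder()
    decoder = msgspec.msgpack.Decoder(Output)

    payload = encoder.encode(Output("req-0", [1, 2, 3], False))
    assert decoder.decode(payload) == Output("req-0", [1, 2, 3], False)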
self.ctx = zmq.asyncio.Context(2) @@ -533,7 +531,4 @@ async def add_request_async(self, request: DetokenizerRequest): async def get_output_async(self) -> List[RequestOutput]: """Get RequestOutputs, RequestsToAbort from Detokenizer.""" - # (frame, ) = await self.output_socket.recv_multipart(copy=False) - # return self.decoder.decode(frame.buffer) return await self.output_socket.recv_pyobj() - # return self.decoder.decode(msg) From 12b3e066d1cce642cac958a8903051d69d8c4233 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:15:11 +0000 Subject: [PATCH 055/132] more cleanup --- vllm/entrypoints/openai/serving_completion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 72b98dab345a3..11278c8421c3c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -319,9 +319,9 @@ async def completion_stream_generator( ) else: logprobs = None + previous_text_lens[i] += len(output.text) previous_num_tokens[i] += len(output.token_ids) - finish_reason = output.finish_reason stop_reason = output.stop_reason @@ -431,6 +431,7 @@ def request_output_to_completion_response( output_text = prompt_text + output.text else: + # return just the delta token_ids = output.token_ids out_logprobs = output.logprobs output_text = output.text From e0926641f6ebda168efb2af64d58e1e80a7d968d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:15:40 +0000 Subject: [PATCH 056/132] more cleanup --- vllm/entrypoints/openai/serving_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 11278c8421c3c..d87c410c0124c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -298,6 +298,7 @@ async def completion_stream_generator( ] has_echoed[i] = True else: + # return just the delta delta_text = output.text delta_token_ids = output.token_ids out_logprobs = output.logprobs @@ -431,7 +432,6 @@ def request_output_to_completion_response( output_text = prompt_text + output.text else: - # return just the delta token_ids = output.token_ids out_logprobs = output.logprobs output_text = output.text From 32df238cdb0dddd06d3c9067ac8c48b119c2f97a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:16:15 +0000 Subject: [PATCH 057/132] updated --- vllm/outputs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index b2f869b862a6a..2ecdf74ee59b3 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -132,7 +132,6 @@ def __init__( self.encoder_prompt_token_ids = encoder_prompt_token_ids self.num_cached_tokens = num_cached_tokens - @classmethod def new( cls, From 7dff86359632c98f30f0d0d5a5cca6d20f9b03cf Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:16:52 +0000 Subject: [PATCH 058/132] more cleanup --- vllm/v1/engine/__init__.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 367aee130cd75..c95bc22a9aaab 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -89,15 +89,6 @@ class EngineCoreProfile: is_start: bool -class DetokenizerRequestType(enum.Enum): - """ - Request types defined as hex byte strings, so it can be sent over sockets - without separate 
encoding step. - """ - NEW = b'\x00' - OUT = b'\x01' - - class EngineCoreRequestType(enum.Enum): """ Request types defined as hex byte strings, so it can be sent over sockets From 103729d5b186cd1307d226eaf21622f6bf94b150 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:41:21 +0000 Subject: [PATCH 059/132] updated --- vllm/v1/engine/__init__.py | 27 ++---------------- vllm/v1/engine/async_llm.py | 32 ++++++++------------- vllm/v1/engine/core.py | 5 ++-- vllm/v1/engine/detokenizer.py | 54 +++++++++++++++-------------------- vllm/v1/engine/processor.py | 20 +++---------- 5 files changed, 43 insertions(+), 95 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index c95bc22a9aaab..df0001679e555 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -18,33 +18,10 @@ class BackgroundProcHandle: output_path: str -class DetokenizerRequest( - msgspec.Struct, - array_like=True, # type: ignore[call-arg] - omit_defaults=True, # type: ignore[call-arg] - gc=False): # type: ignore[call-arg] - - request_id: str - prompt: Optional[str] - prompt_token_ids: List[int] - skip_special_tokens: bool - spaces_between_special_tokens: bool - output_kind: RequestOutputKind - - stop: List[str] - include_stop_str_in_output: bool - - @dataclass -class EngineCoreRequest: - - # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput, - # but this object is currently not playing well with msgspec - # due to circular imports and typing we have in data.py +class EngineRequest: request_id: str - #NOTE(Nick): I don't think we need to pass prompt here since it should - # always be tokenized? prompt: Optional[str] prompt_token_ids: List[int] mm_inputs: Optional[List[Optional[MultiModalKwargs]]] @@ -99,4 +76,4 @@ class EngineCoreRequestType(enum.Enum): PROFILE = b'\x02' -EngineCoreRequestUnion = Union[EngineCoreRequest, EngineCoreProfile, List[str]] +EngineCoreRequestUnion = Union[EngineRequest, EngineCoreProfile, List[str]] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 003db30adbc12..8dcf8de1a092e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -13,7 +13,7 @@ from vllm.outputs import RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext @@ -165,22 +165,16 @@ async def add_request( ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" - # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. - _, engine_core_req = self.processor.process_inputs( + # 1) Convert Input --> EngineRequest. + engine_request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 1) Add to RequestState tracker. The "event" is used to manage - # concurrency between generate() and output_handler(). + # 2) Create Queue (output_handler pushes, generate pulls) self.rid_to_queue[request_id] = asyncio.Queue() - # 3) Add the DetokenizerRequest to Detokenizer. - # TODO: sending these separately is a race condition. We should instead - # have the EngineCore do the "AddRequest" logic. 
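The TODO being deleted above flagged the race of sending a new request to the Detokenizer and the EngineCore separately; this commit's fix is to hand the request only to the Detokenizer, which registers its own state and then forwards the request to the EngineCore (see the busy-loop hunk later in this patch). A minimal sketch of that register-then-forward step, with an illustrative state dict standing in for `IncrementalDetokenizer`:

    # Register detokenizer-side state *before* forwarding the request, so
    # engine outputs can never arrive for an unknown request id.
    import zmq

    def handle_new_request(request, request_states: dict,
                           to_engine_core: zmq.Socket) -> None:
        assert request.request_id not in request_states
        # 1) Record per-request state first (stand-in for the real
        #    IncrementalDetokenizer state).
        request_states[request.request_id] = {
            "prompt_token_ids": list(request.prompt_token_ids),
            "output_text": "",
        }
        # 2) Only then forward the request to the engine process.
        to_engine_core.send_pyobj(request)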
- # await self.detokenizer.add_request_async(detokenizer_req) - - # 4) Add the EngineCoreRequest to EngineCore. - await self.engine_core.add_request_async(engine_core_req) + # 3) Send to Detokenizer. + await self.detokenizer.add_request_async(engine_request) return self.rid_to_queue[request_id] @@ -201,16 +195,15 @@ async def generate( ) -> AsyncGenerator[RequestOutput, None]: """ Main function called by the API server to kick off a request - * 1) Make RequestState corresponding to the Request. + * 1) Make a queue corresponding to the Request. # 2) Processing the Input. - * 3) Adding the Request to the Detokenizer. - * 4) Adding the Request to the EngineCore (separate process). + * 3) Adding the Request to the Detokenize + EngineCore. - The output_handler() loop runs in a background task, pulling from - EngineCore and updating the RequestState and setting the asyncio event. + The output_handler() loop runs in a background task, pulling + from Detokenizer and pushing to the per request queue. - The caller of generate() waits on the asyncio event and forwards - the latest RequestOutput back to the caller. + The generate() pulls from the per requests queue and yeilds + to the caller which iterates the AsyncGenerator. """ # We start the output_handler on the first call to generate() so that @@ -218,7 +211,6 @@ async def generate( # to handle startup failure gracefully in the OpenAI server. # if self.output_handler is None: if self.to_create_loop: - import signal def signal_handler(self, signum=None, frame=None): logger.warning( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index ebfa9b14d90e7..e35e5ac464305 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -18,9 +18,8 @@ from vllm.utils import get_open_zmq_ipc_path from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, + EngineCoreProfile, EngineRequest, EngineCoreRequestType, EngineCoreRequestUnion, - DetokenizerRequestType, BackgroundProcHandle) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor @@ -89,7 +88,7 @@ def _initialize_kv_caches(self, "warmup model) took %.2f seconds"), elapsed) return num_gpu_blocks, num_cpu_blocks - def add_request(self, request: EngineCoreRequest): + def add_request(self, request: EngineRequest): """Add request to the scheduler.""" if request.mm_hashes is not None: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 1a19d99b7d6bc..c265cc984dc5e 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -14,11 +14,10 @@ AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import get_open_zmq_ipc_path, kill_process_tree -from vllm.v1.engine import (DetokenizerRequest, EngineCoreOutput, - EngineCoreOutputs, BackgroundProcHandle,) +from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, + BackgroundProcHandle, EngineRequest) from vllm.v1.utils import (make_zmq_socket, zmq_socket_ctx, wait_for_startup) -from vllm.v1.serial_utils import PickleEncoder logger = init_logger(__name__) @@ -66,19 +65,20 @@ def output_token_ids(self) -> List[int]: def from_new_request( cls, tokenizer: AnyTokenizer, - request: DetokenizerRequest, + request: EngineRequest, ) -> "IncrementalDetokenizer": + sampling_params = request.sampling_params tokens, prefix_offset, read_offset = 
convert_prompt_ids_to_tokens( tokenizer=tokenizer, prompt_ids=request.prompt_token_ids, - skip_special_tokens=request.skip_special_tokens, + skip_special_tokens=sampling_params.skip_special_tokens, ) - stops = request.stop + stops = request.sampling_params # Number of chars to hold back when stop strings are to be excluded # from streamed output. - if stops and not request.include_stop_str_in_output: + if stops and not sampling_params.include_stop_str_in_output: stop_buffer_length = max(len(s) for s in stops) - 1 else: stop_buffer_length = 0 @@ -90,13 +90,13 @@ def from_new_request( # NOTE(Nick): could we take ownership of it though? token_ids=request.prompt_token_ids.copy(), stop=stops, - include_stop_str_in_output=request.include_stop_str_in_output, + include_stop_str_in_output=sampling_params.include_stop_str_in_output, prefix_offset=prefix_offset, read_offset=read_offset, - skip_special_tokens=request.skip_special_tokens, - spaces_between_special_tokens=request. + skip_special_tokens=sampling_params.skip_special_tokens, + spaces_between_special_tokens=sampling_params. spaces_between_special_tokens, - output_kind=request.output_kind, + output_kind=sampling_params.output_kind, request_id=request.request_id, prompt=request.prompt, prompt_token_ids=request.prompt_token_ids, @@ -266,7 +266,7 @@ def abort_requests( def add_request( self, - request: DetokenizerRequest, + request: EngineRequest, ): """Add new request to the Detokenizer.""" @@ -428,29 +428,24 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the Detokenizer.""" - log_interval = 0 - import time - - last_log = time.perf_counter() try: # TODO: handle aborted due to client cancellation # TODO: pickle -> msgpack # TODO: send stop string aborts back to EngineCore directly - decoder_new = msgspec.msgpack.Decoder(DetokenizerRequest) + decoder_new = msgspec.msgpack.Decoder(EngineRequest) decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) - encoder = msgspec.msgpack.Encoder() - with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as engine_core_outputs_socket, - zmq_socket_ctx(self.input_path, zmq.PULL) as input_socket, + with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as from_engine_core, + zmq_socket_ctx(self.input_path, zmq.PULL) as from_llm_engine, zmq_socket_ctx(self.output_path, zmq.PUSH) as output_socket): # TODO: avoid poll by having both EngineCore # and AsyncLLM send to the same socket (unclear why this # was not working when I originally tried it) poller = zmq.Poller() - poller.register(engine_core_outputs_socket, zmq.POLLIN) - poller.register(input_socket, zmq.POLLIN) + poller.register(from_engine_core, zmq.POLLIN) + poller.register(from_llm_engine, zmq.POLLIN) epoch = 0 while True: @@ -460,19 +455,16 @@ def run_busy_loop(self): socks = dict(poller.poll()) # Handle NewRequest. - if input_socket in socks: - (frame, ) = input_socket.recv_multipart(copy=False) - detokenizer_request = decoder_new.decode(frame.buffer) - self.add_request(detokenizer_request) + if from_llm_engine in socks: + (frame, ) = from_llm_engine.recv_multipart(copy=False) + engine_request = decoder_new.decode(frame.buffer) + self.add_request(engine_request) # Handle EngineCoreOutput. 
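For readers skimming the diff, the two-input busy loop above reduces to the standard pyzmq poller pattern: block on both PULL sockets and dispatch on whichever is readable. A standalone sketch with placeholder handlers (socket names mirror the hunk, the handler bodies do not):

    import pickle

    import zmq

    def busy_loop(from_llm_engine: zmq.Socket,
                  from_engine_core: zmq.Socket) -> None:
        poller = zmq.Poller()
        poller.register(from_llm_engine, zmq.POLLIN)
        poller.register(from_engine_core, zmq.POLLIN)
        while True:
            socks = dict(poller.poll())
            if from_llm_engine in socks:
                # New request from the frontend process.
                request = pickle.loads(from_llm_engine.recv())
                print("new request:", request)
            if from_engine_core in socks:
                # Batch of engine outputs.
                (frame, ) = from_engine_core.recv_multipart(copy=False)
                print("engine output bytes:", len(frame.buffer))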
- if engine_core_outputs_socket in socks: - (frame, ) = engine_core_outputs_socket.recv_multipart(copy=False) + if from_engine_core in socks: + (frame, ) = from_engine_core.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs request_outputs, _ = self.step(engine_core_outputs) - # msg = encoder.encode(detokenizer_outputs) - # # output_socket.send_multipart((msg, ), copy=False) - # output_socket.send(msg) output_socket.send_pyobj(request_outputs) except Exception as e: diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 679bf8e25e9ca..d37eab3418c3e 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -14,7 +14,7 @@ from vllm.sampling_params import SamplingParams from vllm.transformers_utils.config import try_get_generation_config from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup -from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest +from vllm.v1.engine import DetokenizerData, EngineRequest from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient @@ -61,7 +61,7 @@ def process_inputs( trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: + ) -> EngineRequest: # TODO(woosuk): Support pooling models. # TODO(woosuk): Check max_logprobs @@ -122,20 +122,8 @@ def process_inputs( decoder_inputs.multi_modal_data, mm_hashes, decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs) - # Make Request for Detokenizer. - detokenizer_request = DetokenizerRequest( - request_id, - decoder_inputs.prompt, - decoder_inputs.prompt_token_ids, - sampling_params.skip_special_tokens, - sampling_params.spaces_between_special_tokens, - sampling_params.output_kind, - sampling_params.stop, - sampling_params.include_stop_str_in_output, - ) - # Make Request for EngineCore. 
- engine_core_request = EngineCoreRequest( + engine_request = EngineRequest( request_id, decoder_inputs.prompt, decoder_inputs.prompt_token_ids, @@ -148,7 +136,7 @@ def process_inputs( lora_request, ) - return detokenizer_request, engine_core_request + return engine_request def _validate_model_inputs(self, inputs: ProcessorInputs): if is_encoder_decoder_inputs(inputs): From 380086c0b565403a5489bf489ffa560901384254 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 22:47:01 +0000 Subject: [PATCH 060/132] updated --- vllm/v1/engine/detokenizer.py | 47 ++--------------------------------- 1 file changed, 2 insertions(+), 45 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index c265cc984dc5e..91b5f7a9c1800 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -104,37 +104,6 @@ def from_new_request( stop_buffer_length=stop_buffer_length, ) - @classmethod - def from_eco( - cls, - tokenizer: AnyTokenizer, - eco: EngineCoreOutput, - ): - tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( - tokenizer=tokenizer, - prompt_ids=eco.prompt_token_ids, - skip_special_tokens=True, - ) - - return cls( - output_text="", - tokens=tokens, - token_ids=eco.prompt_token_ids, - stop=[], - include_stop_str_in_output=False, - prefix_offset=prefix_offset, - read_offset=read_offset, - skip_special_tokens=True, - spaces_between_special_tokens=True, - output_kind=RequestOutputKind.DELTA, - request_id=eco.request_id, - prompt=eco.prompt, - prompt_token_ids=eco.prompt_token_ids, - tokenizer=tokenizer, - stop_buffer_length=0, - ) - - def add_tokens( self, new_token_ids: List[int], @@ -275,14 +244,6 @@ def add_request( request_state = IncrementalDetokenizer.from_new_request( self.tokenizer, request) self.request_states[request.request_id] = request_state - - def add_request_eco( - self, - eco: EngineCoreOutput, - ): - request_state = IncrementalDetokenizer.from_eco( - self.tokenizer, eco) - self.request_states[eco.request_id] = request_state def step( @@ -296,9 +257,6 @@ def step( for engine_core_output in encore_core_outputs: request_id = engine_core_output.request_id - if request_id not in self.request_states: - self.add_request_eco(engine_core_output) - detokenizer = self.request_states.get(request_id) if detokenizer is None: # Ignore output for already-aborted request. 
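The detokenizer state that `step()` consults above also carries the `stop_buffer_length` bookkeeping set up in `from_new_request`: while a request is running, the last `max(len(stop)) - 1` characters are held back so a stop string that straddles a streaming chunk is never emitted. A simplified, self-contained illustration of that rule (it ignores `include_stop_str_in_output` and the offset tracking):

    from typing import List

    def streamable_prefix(output_text: str, stops: List[str],
                          finished: bool) -> str:
        # Hold back enough characters to cover any partially generated stop
        # string; release everything once the request is finished.
        holdback = 0 if finished or not stops else max(len(s) for s in stops) - 1
        return output_text[:len(output_text) - holdback] if holdback else output_text

    assert streamable_prefix("Hello wor", ["world"], finished=False) == "Hello"
    assert streamable_prefix("Hello wor", ["world"], finished=True) == "Hello wor"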
@@ -514,11 +472,10 @@ def shutdown(self): if self.proc_handle.proc.is_alive(): kill_process_tree(self.proc_handle.proc.pid) - async def add_request_async(self, request: DetokenizerRequest): + async def add_request_async(self, request: EngineRequest): """Send new DetokenizerRequest to Detokenizer.""" - msg = (self.encoder.encode(request), ) - await self.input_socket.send_multipart(msg, copy=False) + await self.input_socket.send_pyobj(request) async def get_output_async(self) -> List[RequestOutput]: """Get RequestOutputs, RequestsToAbort from Detokenizer.""" From 6f0adfee1dbae03e2ec3e40b0ac453c854ae645b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 23:15:26 +0000 Subject: [PATCH 061/132] working again --- tests/v1/engine/test_engine_core.py | 6 +- tests/v1/engine/test_engine_core_client.py | 6 +- vllm/v1/engine/__init__.py | 4 +- vllm/v1/engine/async_llm.py | 11 ++- vllm/v1/engine/core.py | 38 ++++---- vllm/v1/engine/core_client.py | 101 +++------------------ vllm/v1/engine/detokenizer.py | 51 ++++++----- vllm/v1/engine/llm_engine.py | 4 +- vllm/v1/engine/processor.py | 2 +- vllm/v1/request.py | 4 +- 10 files changed, 81 insertions(+), 146 deletions(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index a61ec63a365b5..5c9bfa02a5b0f 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -8,7 +8,7 @@ from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform from vllm.usage.usage_lib import UsageContext -from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine import EngineRequest from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core import EngineCore @@ -22,8 +22,8 @@ PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids -def make_request() -> EngineCoreRequest: - return EngineCoreRequest( +def make_request() -> EngineRequest: + return EngineRequest( request_id=uuid.uuid4(), prompt=PROMPT, prompt_token_ids=PROMPT_TOKENS, diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 2f1cbec607a91..20db30e8b1223 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -10,7 +10,7 @@ from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform from vllm.usage.usage_lib import UsageContext -from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine import EngineRequest from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core_client import EngineCoreClient @@ -24,8 +24,8 @@ PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids -def make_request(params: SamplingParams) -> EngineCoreRequest: - return EngineCoreRequest( +def make_request(params: SamplingParams) -> EngineRequest: + return EngineRequest( request_id=str(uuid.uuid4()), prompt=PROMPT, prompt_token_ids=PROMPT_TOKENS, diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index df0001679e555..d55484ee524d8 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -66,7 +66,7 @@ class EngineCoreProfile: is_start: bool -class EngineCoreRequestType(enum.Enum): +class EngineRequestType(enum.Enum): """ Request types defined as hex byte strings, so it can be sent over sockets without separate encoding step. 
@@ -76,4 +76,4 @@ class EngineCoreRequestType(enum.Enum): PROFILE = b'\x02' -EngineCoreRequestUnion = Union[EngineRequest, EngineCoreProfile, List[str]] +EngineRequestUnion = Union[EngineRequest, EngineCoreProfile, List[str]] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 8dcf8de1a092e..6164fe1cf509a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -18,7 +18,7 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils import get_open_zmq_ipc_path -from vllm.v1.engine.core_client import AsyncMPClient +from vllm.v1.engine.core_client import MultiprocessEngineCore from vllm.v1.engine.detokenizer import DetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor @@ -64,18 +64,20 @@ def __init__( # List of cancelled request ids to be aborted. self.client_aborted_requests: List[str] = [] - # Processor (converts Inputs --> EngineCoreRequests). + # Processor (converts Inputs --> EngineRequest). self.processor = Processor(vllm_config.model_config, vllm_config.lora_config, self.tokenizer, input_registry) - # IPC path for EngineCore -> Detokenizer. + # IPC paths. engine_core_outputs_path = get_open_zmq_ipc_path() + engine_core_inputs_path = get_open_zmq_ipc_path() # Detokenizer (converts EngineCoreOutputs --> RequestOutput). self.detokenizer = DetokenizerClient( engine_core_outputs_path=engine_core_outputs_path, + engine_core_inputs_path=engine_core_inputs_path, tokenizer_name=vllm_config.model_config.tokenizer, tokenizer_mode=vllm_config.model_config.tokenizer_mode, trust_remote_code=vllm_config.model_config.trust_remote_code, @@ -83,7 +85,8 @@ def __init__( ) # EngineCore (starts the engine in background process). - self.engine_core = AsyncMPClient( + self.engine_core = MultiprocessEngineCore( + input_path=engine_core_inputs_path, output_path=engine_core_outputs_path, vllm_config=vllm_config, executor_class=executor_class, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e35e5ac464305..9e985be743e0d 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -19,7 +19,7 @@ from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineRequest, - EngineCoreRequestType, EngineCoreRequestUnion, + EngineRequestType, EngineRequestUnion, BackgroundProcHandle) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor @@ -155,7 +155,7 @@ def __init__( # and to overlap some serialization/deserialization with the # model forward pass. # Threads handle Socket <-> Queues and core_busy_loop uses Queue. 
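The comment above ("Threads handle Socket <-> Queues and core_busy_loop uses Queue") is the key decoupling in EngineCoreProc: socket I/O lives on daemon threads and the busy loop only ever touches in-process queues. A minimal sketch of the input side, assuming a PULL socket carrying pickled requests (whether to bind or connect depends on the process topology):

    import queue
    import threading

    import zmq

    def start_input_thread(ctx: zmq.Context, input_path: str) -> queue.Queue:
        input_queue: queue.Queue = queue.Queue()

        def _pull_loop() -> None:
            socket = ctx.socket(zmq.PULL)
            socket.connect(input_path)
            while True:
                # Blocking recv happens here, off the busy loop's thread.
                input_queue.put_nowait(socket.recv_pyobj())

        threading.Thread(target=_pull_loop, daemon=True).start()
        return input_queue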
- self.input_queue: queue.Queue[EngineCoreRequestUnion] = queue.Queue() + self.input_queue: queue.Queue[EngineRequestUnion] = queue.Queue() self.output_queue: queue.Queue[List[EngineCoreOutput]] = queue.Queue() threading.Thread(target=self.process_input_socket, args=(input_path, ), @@ -289,10 +289,10 @@ def _log_stats(self): self._last_logging_time = now - def _handle_client_request(self, request: EngineCoreRequestUnion) -> None: - """Handle EngineCoreRequest or EngineCoreABORT from Client.""" + def _handle_client_request(self, request: EngineRequestUnion) -> None: + """Handle EngineRequest or EngineCoreABORT from Client.""" - if isinstance(request, EngineCoreRequest): + if isinstance(request, EngineRequest): self.add_request(request) elif isinstance(request, EngineCoreProfile): self.model_executor.profile(request.is_start) @@ -311,21 +311,23 @@ def process_input_socket(self, input_path: str): with zmq_socket_ctx(input_path, zmq.PULL) as socket: while True: # (RequestType, RequestData) - type_frame, data_frame = socket.recv_multipart(copy=False) - request_type = type_frame.buffer - request_data = data_frame.buffer - - # Deserialize the request data. - if request_type == EngineCoreRequestType.ADD.value: - request = decoder_add_req.decode(request_data) - elif request_type == EngineCoreRequestType.ABORT.value: - request = decoder_abort_req.decode(request_data) - elif request_type == EngineCoreRequestType.PROFILE.value: - request = pickle.loads(request_data) - else: - raise ValueError(f"Unknown RequestType: {request_type}") + # type_frame, data_frame = socket.recv_multipart(copy=False) + # request_type = type_frame.buffer + # request_data = data_frame.buffer + + + # # Deserialize the request data. + # if request_type == EngineRequestType.ADD.value: + # request = decoder_add_req.decode(request_data) + # elif request_type == EngineRequestType.ABORT.value: + # request = decoder_abort_req.decode(request_data) + # elif request_type == EngineRequestType.PROFILE.value: + # request = pickle.loads(request_data) + # else: + # raise ValueError(f"Unknown RequestType: {request_type}") # Push to input queue for core busy loop. 
+ request = socket.recv_pyobj() self.input_queue.put_nowait(request) def process_output_socket(self, output_path: str): diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 7559ca1af2a03..d094e2de11b85 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -10,8 +10,8 @@ from vllm.utils import kill_process_tree, get_open_zmq_ipc_path from vllm.v1.engine import (BackgroundProcHandle, EngineCoreOutput, EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion) + EngineCoreProfile, EngineRequest, + EngineRequestType, EngineRequestUnion) from vllm.v1.engine.core import (EngineCore, EngineCoreProc) from vllm.v1.serial_utils import PickleEncoder from vllm.v1.utils import make_zmq_socket @@ -58,7 +58,7 @@ def shutdown(self): def get_output(self) -> List[EngineCoreOutput]: raise NotImplementedError - def add_request(self, request: EngineCoreRequest) -> None: + def add_request(self, request: EngineRequest) -> None: raise NotImplementedError def profile(self, is_start: bool = True) -> None: @@ -70,7 +70,7 @@ def abort_requests(self, request_ids: List[str]) -> None: async def get_output_async(self) -> List[EngineCoreOutput]: raise NotImplementedError - async def add_request_async(self, request: EngineCoreRequest) -> None: + async def add_request_async(self, request: EngineRequest) -> None: raise NotImplementedError async def profile_async(self, is_start: bool = True) -> None: @@ -86,7 +86,7 @@ class InprocClient(EngineCoreClient): for use in LLMEngine for V0-style add_request() and step() EngineCore setup in this process (no busy loop). - * pushes EngineCoreRequest directly into the EngineCore + * pushes EngineRequest directly into the EngineCore * pulls EngineCoreOutputs by stepping the EngineCore TODO: support asyncio-mode for debugging. @@ -98,7 +98,7 @@ def __init__(self, *args, **kwargs): def get_output(self) -> List[EngineCoreOutput]: return self.engine_core.step() - def add_request(self, request: EngineCoreRequest) -> None: + def add_request(self, request: EngineRequest) -> None: self.engine_core.add_request(request) def abort_requests(self, request_ids: List[str]) -> None: @@ -114,53 +114,29 @@ def profile(self, is_start: bool = True) -> None: self.engine_core.profile(is_start) -class MPClient(EngineCoreClient): +class MultiprocessEngineCore: """ - MPClient: base client for multi-proc EngineCore. + MultiprocessEngineCore: base client for multi-proc EngineCore. EngineCore runs in a background process busy loop, getting - new EngineCoreRequests and returning EngineCoreOutputs + new EngineRequests and returning EngineCoreOutputs - * pushes EngineCoreRequests via input_socket + * pushes EngineRequests via input_socket * pulls EngineCoreOutputs via output_socket - - * AsyncMPClient subclass for AsyncLLM usage - * SyncMPClient subclass for LLM usage """ def __init__( self, *args, - asyncio_mode: bool, + input_path: Optional[str] = None, output_path: Optional[str] = None, **kwargs, ): - # Serialization setup. - self.encoder = PickleEncoder() - self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) - - # ZMQ setup. 
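The `recv_pyobj()` call above is pyzmq's pickle convenience layer, replacing the explicit type-byte + msgpack framing in the commented-out block. A tiny self-contained round trip showing what those helpers do (inproc transport and payload are illustrative); pickle is convenient between trusted local processes, though slower and less strict than the msgspec path used elsewhere in this series:

    import zmq

    ctx = zmq.Context()
    pull = ctx.socket(zmq.PULL)
    push = ctx.socket(zmq.PUSH)
    pull.bind("inproc://demo")
    push.connect("inproc://demo")

    push.send_pyobj({"request_id": "req-0", "token_ids": [1, 2, 3]})
    print(pull.recv_pyobj())  # {'request_id': 'req-0', 'token_ids': [1, 2, 3]}

    push.close()
    pull.close()
    ctx.term()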
- if asyncio_mode: - print("HERE HERE HERE") - self.ctx = zmq.asyncio.Context(io_threads=2) - else: - self.ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] - - input_path = get_open_zmq_ipc_path() - self.input_socket = make_zmq_socket( - self.ctx, - input_path, - zmq.PUSH, - ) - - if output_path is None: - output_path = get_open_zmq_ipc_path() - # Start EngineCore in background process. self.proc_handle: Optional[BackgroundProcHandle] self.proc_handle = EngineCoreProc.make_engine_core_process( *args, - input_path=input_path, - output_path=output_path, + input_path=(input_path or get_open_zmq_ipc_path()), + output_path=(output_path or get_open_zmq_ipc_path()), **kwargs, ) atexit.register(self.shutdown) @@ -172,9 +148,6 @@ def shutdown(self): # in case shutdown gets called via __del__ first atexit.unregister(self.shutdown) - # Shut down the zmq context. - self.ctx.destroy(linger=0) - if hasattr(self, "proc_handle") and self.proc_handle: # Shutdown the process if needed. if self.proc_handle.proc.is_alive(): @@ -197,51 +170,3 @@ def shutdown(self): def __del__(self): self.shutdown() - - -class SyncMPClient(MPClient): - """Synchronous client for multi-proc EngineCore.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, asyncio_mode=False, **kwargs) - - def _send_input(self, request_type: EngineCoreRequestType, - request: EngineCoreRequestUnion) -> None: - - # (RequestType, SerializedRequest) - msg = (request_type.value, self.encoder.encode(request)) - self.input_socket.send_multipart(msg, copy=False) - - def add_request(self, request: EngineCoreRequest) -> None: - self._send_input(EngineCoreRequestType.ADD, request) - - def abort_requests(self, request_ids: List[str]) -> None: - self._send_input(EngineCoreRequestType.ABORT, request_ids) - - def profile(self, is_start: bool = True) -> None: - self._send_input(EngineCoreRequestType.PROFILE, - EngineCoreProfile(is_start)) - - -class AsyncMPClient(MPClient): - """Asyncio-compatible client for multi-proc EngineCore.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, asyncio_mode=True, **kwargs) - - async def _send_input(self, request_type: EngineCoreRequestType, - request: EngineCoreRequestUnion) -> None: - - msg = (request_type.value, self.encoder.encode(request)) - await self.input_socket.send_multipart(msg, copy=False) - - async def add_request_async(self, request: EngineCoreRequest) -> None: - await self._send_input(EngineCoreRequestType.ADD, request) - - async def abort_requests_async(self, request_ids: List[str]) -> None: - if len(request_ids) > 0: - await self._send_input(EngineCoreRequestType.ABORT, request_ids) - - async def profile_async(self, is_start: bool = True) -> None: - await self._send_input(EngineCoreRequestType.PROFILE, - EngineCoreProfile(is_start)) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 91b5f7a9c1800..7c5ce6fbbfed9 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -3,7 +3,7 @@ import msgspec import signal from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import Dict, Iterable, List, Optional, Union from vllm.engine.output_processor.stop_checker import StopChecker from vllm.executor.multiproc_worker_utils import get_mp_context @@ -75,7 +75,7 @@ def from_new_request( skip_special_tokens=sampling_params.skip_special_tokens, ) - stops = request.sampling_params + stops = request.sampling_params.stop # Number of chars to hold back when stop strings are 
to be excluded # from streamed output. if stops and not sampling_params.include_stop_str_in_output: @@ -231,20 +231,7 @@ def abort_requests( """Remove the request_ids from the Detokenizer.""" for request_id in request_ids: - self.request_states.pop(request_id, None) - - def add_request( - self, - request: EngineRequest, - ): - """Add new request to the Detokenizer.""" - - assert (request.request_id not in self.request_states) - - request_state = IncrementalDetokenizer.from_new_request( - self.tokenizer, request) - self.request_states[request.request_id] = request_state - + self.request_states.pop(request_id, None) def step( self, encore_core_outputs: List[EngineCoreOutput] @@ -295,6 +282,7 @@ def __init__( self, *args, engine_core_outputs_path: str, + engine_core_inputs_path: str, input_path: str, output_path: str, ready_path: str, @@ -303,6 +291,7 @@ def __init__( super().__init__(*args, **kwargs) self.engine_core_outputs_path = engine_core_outputs_path + self.engine_core_inputs_path = engine_core_inputs_path self.input_path = input_path self.output_path = output_path @@ -314,6 +303,7 @@ def __init__( @staticmethod def make_detokenizer_process( engine_core_outputs_path: str, + engine_core_inputs_path: str, input_path: str, output_path: str, tokenizer_name: str, @@ -326,6 +316,7 @@ def make_detokenizer_process( process_kwargs = { "engine_core_outputs_path": engine_core_outputs_path, + "engine_core_inputs_path": engine_core_inputs_path, "input_path": input_path, "output_path": output_path, "ready_path": ready_path, @@ -391,12 +382,12 @@ def run_busy_loop(self): # TODO: pickle -> msgpack # TODO: send stop string aborts back to EngineCore directly - decoder_new = msgspec.msgpack.Decoder(EngineRequest) decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as from_engine_core, zmq_socket_ctx(self.input_path, zmq.PULL) as from_llm_engine, - zmq_socket_ctx(self.output_path, zmq.PUSH) as output_socket): + zmq_socket_ctx(self.engine_core_inputs_path, zmq.PUSH) as to_engine_core, + zmq_socket_ctx(self.output_path, zmq.PUSH) as to_llm_engine): # TODO: avoid poll by having both EngineCore # and AsyncLLM send to the same socket (unclear why this @@ -414,16 +405,25 @@ def run_busy_loop(self): # Handle NewRequest. if from_llm_engine in socks: - (frame, ) = from_llm_engine.recv_multipart(copy=False) - engine_request = decoder_new.decode(frame.buffer) - self.add_request(engine_request) + pickled_request = from_llm_engine.recv() + request = pickle.loads(pickled_request) + + assert (request.request_id not in self.request_states) + + # Add to Detokenizer. + request_state = IncrementalDetokenizer.from_new_request( + self.tokenizer, request) + self.request_states[request.request_id] = request_state + + # Forward to EngineCore. + to_engine_core.send(pickled_request) # Handle EngineCoreOutput. if from_engine_core in socks: (frame, ) = from_engine_core.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs request_outputs, _ = self.step(engine_core_outputs) - output_socket.send_pyobj(request_outputs) + to_llm_engine.send_pyobj(request_outputs) except Exception as e: logger.error(e) @@ -431,7 +431,11 @@ def run_busy_loop(self): class DetokenizerClient: - def __init__(self, *args, engine_core_outputs_path: str, **kwargs): + def __init__(self, + *args, + engine_core_outputs_path: str, + engine_core_inputs_path: str, + **kwargs): # Serialization setup. 
self.encoder = msgspec.msgpack.Encoder() @@ -460,6 +464,7 @@ def __init__(self, *args, engine_core_outputs_path: str, **kwargs): self.proc_handle = DetokenizerProc.make_detokenizer_process( *args, engine_core_outputs_path=engine_core_outputs_path, + engine_core_inputs_path=engine_core_inputs_path, input_path=input_path, output_path=output_path, **kwargs, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 15dedbd0f9529..1e508c5d240e1 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -54,7 +54,7 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() - # Processor (convert Inputs --> EngineCoreRequests) + # Processor (convert Inputs --> EngineRequests) self.processor = Processor(vllm_config.model_config, vllm_config.lora_config, self.tokenizer, input_registry, mm_registry) @@ -67,7 +67,7 @@ def __init__( revision=vllm_config.model_config.tokenizer_revision, ) - # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) + # EngineCore (gets EngineRequests and gives EngineCoreOutputs) self.engine_core = EngineCoreClient.make_client( vllm_config, executor_class, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index d37eab3418c3e..60beecb24a61c 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -14,7 +14,7 @@ from vllm.sampling_params import SamplingParams from vllm.transformers_utils.config import try_get_generation_config from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup -from vllm.v1.engine import DetokenizerData, EngineRequest +from vllm.v1.engine import EngineRequest from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 1737d096e811d..e6de57dab7672 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -6,7 +6,7 @@ from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams from vllm.sequence import RequestMetrics -from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine import EngineRequest from vllm.v1.utils import ConstantList @@ -57,7 +57,7 @@ def __init__( self.mm_inputs = self.inputs.multi_modal_inputs @classmethod - def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": + def from_engine_core_request(cls, request: EngineRequest) -> "Request": return cls( request_id=request.request_id, inputs=token_inputs( From 5d2c9ae87e5d3055ce105398a6f8a01f73f85fc3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 23:34:19 +0000 Subject: [PATCH 062/132] design without incremental streaming seems okay --- vllm/v1/engine/async_llm.py | 7 ++----- vllm/v1/engine/detokenizer.py | 11 ++++------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6164fe1cf509a..d6e2bfe4fc04b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -240,8 +240,8 @@ def signal_handler(self, signum=None, frame=None): out = await asyncio.wait_for(queue.get(), timeout=4) q_size = queue.qsize() - if q_size > 0: - logger.info(f"{q_size}") + # if q_size > 0: + # logger.info(f"{q_size=}") if out.finished: del self.rid_to_queue[request_id] yield out @@ -253,9 +253,6 @@ def signal_handler(self, signum=None, frame=None): # TODO(rob): do request cancellation checking here. 
# logger.debug("Timeout waiting for %s", request_id) continue - - - # async def _process_cancellations(self) -> None: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 7c5ce6fbbfed9..6f03ec57e1105 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -399,20 +399,18 @@ def run_busy_loop(self): epoch = 0 while True: logger.info(f"EPOCH: {epoch}") - epoch += 1 socks = dict(poller.poll()) # Handle NewRequest. if from_llm_engine in socks: pickled_request = from_llm_engine.recv() - request = pickle.loads(pickled_request) + request: EngineRequest = pickle.loads(pickled_request) assert (request.request_id not in self.request_states) # Add to Detokenizer. - request_state = IncrementalDetokenizer.from_new_request( - self.tokenizer, request) + request_state = IncrementalDetokenizer.from_new_request(self.tokenizer, request) self.request_states[request.request_id] = request_state # Forward to EngineCore. @@ -420,6 +418,8 @@ def run_busy_loop(self): # Handle EngineCoreOutput. if from_engine_core in socks: + epoch += 1 + (frame, ) = from_engine_core.recv_multipart(copy=False) engine_core_outputs = decoder_out.decode(frame.buffer).outputs request_outputs, _ = self.step(engine_core_outputs) @@ -436,9 +436,6 @@ def __init__(self, engine_core_outputs_path: str, engine_core_inputs_path: str, **kwargs): - - # Serialization setup. - self.encoder = msgspec.msgpack.Encoder() # ZMQ setup. self.ctx = zmq.asyncio.Context(2) From 0574b89b8051f142f7c40089b201f71624fef183 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 21 Dec 2024 23:35:58 +0000 Subject: [PATCH 063/132] updated --- vllm/v1/core/scheduler.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index c83c931f75fea..f76364f64033d 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -140,7 +140,6 @@ def schedule(self) -> "SchedulerOutput": preempted_req.status = RequestStatus.PREEMPTED preempted_req.num_computed_tokens = 0 - logger.info(f"Preempted: {preempted_req.request_id}") self.waiting.appendleft(preempted_req) preempted_reqs.append(preempted_req) if preempted_req == request: @@ -425,12 +424,10 @@ def update_from_output( # Check for stop and update request state. # This must be called before me make the EngineCoreOutput. stopped = self._check_stop(request) - + # Add EngineCoreOutput for this Request. output = EngineCoreOutput( request_id=req_id, - prompt=request.prompt, - prompt_token_ids=request.prompt_token_ids, new_token_ids=request.output_token_ids[-num_new_tokens:], finished=request.is_finished(), finish_reason=request.get_finished_reason(), From 19a7cd011ea04936a2146dc847241dc987c9f90c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 13:50:04 +0000 Subject: [PATCH 064/132] updated --- benchmarks/backend_request_func.py | 4 ++++ vllm/v1/engine/__init__.py | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index b67849038cf0d..fb3e7c994d4d6 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -295,6 +295,7 @@ async def async_request_openai_completions( if first_chunk_received: output.success = True else: + print("error 0!") output.success = False output.error = ( "Never received a valid chunk to calculate TTFT." 
@@ -302,12 +303,15 @@ async def async_request_openai_completions( output.generated_text = generated_text output.latency = latency else: + print("error 1!") output.error = response.reason or "" output.success = False except Exception: + print("error 2!") output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + print(f"{output.error=}") if pbar: pbar.update(1) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index d55484ee524d8..9e4f8a9d6e29a 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -40,8 +40,6 @@ class EngineCoreOutput( gc=False): # type: ignore[call-arg] request_id: str - prompt_token_ids: List[int] - prompt: Optional[str] new_token_ids: List[int] finished: bool finish_reason: Optional[str] = None From 067d487b51556f84d3555eff24b4fd1a354b038a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 13:52:56 +0000 Subject: [PATCH 065/132] updated --- vllm/v1/engine/async_llm.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d6e2bfe4fc04b..a300af75c2cf8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -25,9 +25,6 @@ logger = init_logger(__name__) -import uvloop -asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) - class AsyncLLM(EngineClient): From c1c8749fd16cd416216d95dbfe2eafc8811df32e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 13:54:56 +0000 Subject: [PATCH 066/132] more cleanup --- vllm/entrypoints/openai/serving_completion.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index d87c410c0124c..bd39a4c42e938 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -159,10 +159,8 @@ async def create_completion( # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - # result_generator = merge_async_iterators( - # *generators, is_cancelled=raw_request.is_disconnected) - assert len(generators) == 1 - result_generator = generators[0] + result_generator = merge_async_iterators( + *generators, is_cancelled=raw_request.is_disconnected) model_name = self._get_model_name(lora_request) num_prompts = len(engine_prompts) @@ -258,9 +256,7 @@ async def completion_stream_generator( include_usage, include_continuous_usage = False, False try: - # async for prompt_idx, res in result_generator: - async for res in result_generator: - prompt_idx = 0 + async for prompt_idx, res in result_generator: prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt From ceacaddce4f203de19cbac7f1690ab014add2dee Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 14:08:37 +0000 Subject: [PATCH 067/132] working e2e with the fds --- vllm/v1/engine/async_llm.py | 82 +++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a300af75c2cf8..9c6206ddb5c31 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -206,50 +206,54 @@ async def generate( to the caller which iterates the AsyncGenerator. 
""" - # We start the output_handler on the first call to generate() so that - # we can call __init__ before the event loop starts, which enables us - # to handle startup failure gracefully in the OpenAI server. - # if self.output_handler is None: - if self.to_create_loop: - import signal - def signal_handler(self, signum=None, frame=None): - logger.warning( - f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." + try: + # We start the output_handler on the first call to generate() so that + # we can call __init__ before the event loop starts, which enables us + # to handle startup failure gracefully in the OpenAI server. + # if self.output_handler is None: + if self.to_create_loop: + import signal + def signal_handler(self, signum=None, frame=None): + logger.warning( + f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." + ) + + self.to_create_loop = False + loop = asyncio.get_event_loop() + loop.create_task(self._run_output_handler()) + loop.add_signal_handler(signal.SIGTERM, signal_handler) + + queue = await self.add_request( + request_id, + prompt, + sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=priority, ) - self.to_create_loop = False - loop = asyncio.get_event_loop() - loop.create_task(self._run_output_handler()) - loop.add_signal_handler(signal.SIGTERM, signal_handler) - - queue = await self.add_request( - request_id, - prompt, - sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, - priority=priority, - ) + while True: + try: + out = await asyncio.wait_for(queue.get(), timeout=4) - while True: - try: - out = await asyncio.wait_for(queue.get(), timeout=4) - - q_size = queue.qsize() - # if q_size > 0: - # logger.info(f"{q_size=}") - if out.finished: - del self.rid_to_queue[request_id] - yield out - break + q_size = queue.qsize() + # if q_size > 0: + # logger.info(f"{q_size=}") + if out.finished: + del self.rid_to_queue[request_id] + yield out + break - yield out + yield out - except asyncio.TimeoutError: - # TODO(rob): do request cancellation checking here. - # logger.debug("Timeout waiting for %s", request_id) - continue + except asyncio.TimeoutError: + # TODO(rob): do request cancellation checking here. + # logger.debug("Timeout waiting for %s", request_id) + continue + except BaseException as e: + logger.error(repr(e)) + raise e # async def _process_cancellations(self) -> None: From 630c72feea07861f10872d7c1b237ef525be4430 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 14:12:21 +0000 Subject: [PATCH 068/132] fix --- benchmarks/backend_request_func.py | 4 ---- vllm/entrypoints/openai/serving_completion.py | 9 ++++++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index fb3e7c994d4d6..b67849038cf0d 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -295,7 +295,6 @@ async def async_request_openai_completions( if first_chunk_received: output.success = True else: - print("error 0!") output.success = False output.error = ( "Never received a valid chunk to calculate TTFT." 
@@ -303,15 +302,12 @@ async def async_request_openai_completions( output.generated_text = generated_text output.latency = latency else: - print("error 1!") output.error = response.reason or "" output.success = False except Exception: - print("error 2!") output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) - print(f"{output.error=}") if pbar: pbar.update(1) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index bd39a4c42e938..019a16ed654e5 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -159,8 +159,9 @@ async def create_completion( # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - result_generator = merge_async_iterators( - *generators, is_cancelled=raw_request.is_disconnected) + # result_generator = merge_async_iterators( + # *generators, is_cancelled=raw_request.is_disconnected) + result_generator = generator model_name = self._get_model_name(lora_request) num_prompts = len(engine_prompts) @@ -256,7 +257,9 @@ async def completion_stream_generator( include_usage, include_continuous_usage = False, False try: - async for prompt_idx, res in result_generator: + # async for prompt_idx, res in result_generator: + async for res in result_generator: + prompt_idx = 0 prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt From e02101233b93d3c98743456bd8e6a2ac3b8dab67 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 14:20:52 +0000 Subject: [PATCH 069/132] updated --- vllm/v1/engine/async_llm.py | 83 +++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 44 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 9c6206ddb5c31..d16a33bbbbad3 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -206,55 +206,50 @@ async def generate( to the caller which iterates the AsyncGenerator. """ - try: - # We start the output_handler on the first call to generate() so that - # we can call __init__ before the event loop starts, which enables us - # to handle startup failure gracefully in the OpenAI server. - # if self.output_handler is None: - if self.to_create_loop: - import signal - def signal_handler(self, signum=None, frame=None): - logger.warning( - f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." - ) - - self.to_create_loop = False - loop = asyncio.get_event_loop() - loop.create_task(self._run_output_handler()) - loop.add_signal_handler(signal.SIGTERM, signal_handler) - - queue = await self.add_request( - request_id, - prompt, - sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, - priority=priority, + # We start the output_handler on the first call to generate() so that + # we can call __init__ before the event loop starts, which enables us + # to handle startup failure gracefully in the OpenAI server. + # if self.output_handler is None: + if self.to_create_loop: + import signal + def signal_handler(self, signum=None, frame=None): + logger.warning( + f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." 
) - while True: - try: - out = await asyncio.wait_for(queue.get(), timeout=4) - - q_size = queue.qsize() - # if q_size > 0: - # logger.info(f"{q_size=}") - if out.finished: - del self.rid_to_queue[request_id] - yield out - break + self.to_create_loop = False + loop = asyncio.get_event_loop() + loop.create_task(self._run_output_handler()) + loop.add_signal_handler(signal.SIGTERM, signal_handler) + + queue = await self.add_request( + request_id, + prompt, + sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=priority, + ) + while True: + try: + out = await asyncio.wait_for(queue.get(), timeout=4) + + q_size = queue.qsize() + # if q_size > 0: + # logger.info(f"{q_size=}") + if out.finished: + del self.rid_to_queue[request_id] yield out + break + + yield out - except asyncio.TimeoutError: - # TODO(rob): do request cancellation checking here. - # logger.debug("Timeout waiting for %s", request_id) - continue - except BaseException as e: - logger.error(repr(e)) - raise e - + except asyncio.TimeoutError: + # TODO(rob): do request cancellation checking here. + # logger.debug("Timeout waiting for %s", request_id) + continue # async def _process_cancellations(self) -> None: # """ From c58d0ff348477b680e8dd6e10787af1f89f64c95 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 14:45:12 +0000 Subject: [PATCH 070/132] performance is now good --- vllm/v1/engine/async_llm.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d16a33bbbbad3..b078b05369e6d 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -234,16 +234,16 @@ def signal_handler(self, signum=None, frame=None): while True: try: - out = await asyncio.wait_for(queue.get(), timeout=4) + if queue.qsize() > 0: + out = queue.get_nowait() + else: + out = await asyncio.wait_for(queue.get(), timeout=4) - q_size = queue.qsize() - # if q_size > 0: - # logger.info(f"{q_size=}") if out.finished: del self.rid_to_queue[request_id] yield out break - + yield out except asyncio.TimeoutError: From d7af4bcc0af97bc870ee729fc4dfebb0b9cd985b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 14:59:54 +0000 Subject: [PATCH 071/132] updated --- vllm/v1/engine/async_llm.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b078b05369e6d..7b87b7ce2c819 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,5 +1,5 @@ import asyncio -from dataclasses import dataclass +import fastapi from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -234,6 +234,8 @@ def signal_handler(self, signum=None, frame=None): while True: try: + # Note: drain queue without awaiting if possible (this helps + # to avoid task switching under load + helps performance) if queue.qsize() > 0: out = queue.get_nowait() else: @@ -247,8 +249,7 @@ def signal_handler(self, signum=None, frame=None): yield out except asyncio.TimeoutError: - # TODO(rob): do request cancellation checking here. 
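The qsize()/get_nowait() branch introduced by patch 070 above is the actual performance fix: awaiting queue.get() suspends the coroutine and forces a task switch even when an output is already buffered, so the generator drains synchronously whenever it can. A minimal, self-contained sketch of that consumer pattern (the None end-of-stream sentinel is an assumption for illustration, not part of the patch):

    import asyncio

    async def consume(q: asyncio.Queue) -> None:
        while True:
            # Take an item without awaiting whenever one is already buffered;
            # only suspend when the queue is empty.
            out = q.get_nowait() if q.qsize() > 0 else await q.get()
            if out is None:  # assumed end-of-stream sentinel
                break
            print(out)

    async def main() -> None:
        q: asyncio.Queue = asyncio.Queue()
        for item in ("tok-1", "tok-2", None):
            q.put_nowait(item)
        await consume(q)

    asyncio.run(main())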
- # logger.debug("Timeout waiting for %s", request_id) + logger.debug("%s request timed out waiting", request_id) continue # async def _process_cancellations(self) -> None: @@ -308,7 +309,7 @@ async def _run_output_handler(self): async def abort(self, request_id: str) -> None: # Note: this is not used outside of testing. - raise ValueError("Not Supported on V1 yet.") + pass def encode( self, From 091435158fbb068662509bb455bfee48fe6d8b8f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 16:06:45 +0000 Subject: [PATCH 072/132] updated --- vllm/v1/engine/__init__.py | 26 +++--- vllm/v1/engine/async_llm.py | 137 ++++++++++++---------------- vllm/v1/engine/core.py | 41 +++------ vllm/v1/engine/core_client.py | 3 - vllm/v1/engine/detokenizer.py | 164 +++++++++++++++++++--------------- 5 files changed, 171 insertions(+), 200 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 9e4f8a9d6e29a..4e84d763ae9be 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -32,6 +32,17 @@ class EngineRequest: arrival_time: float lora_request: Optional[LoRARequest] +@dataclass +class EngineAbortRequest: + + request_ids: List[str] + +@dataclass +class EngineProfileRequest: + + is_start: bool + +EngineRequestUnion = Union[EngineRequest, EngineAbortRequest, EngineProfileRequest] class EngineCoreOutput( msgspec.Struct, @@ -59,19 +70,4 @@ class EngineCoreOutputs( outputs: List[EngineCoreOutput] -@dataclass -class EngineCoreProfile: - is_start: bool - - -class EngineRequestType(enum.Enum): - """ - Request types defined as hex byte strings, so it can be sent over sockets - without separate encoding step. - """ - ADD = b'\x00' - ABORT = b'\x01' - PROFILE = b'\x02' - -EngineRequestUnion = Union[EngineRequest, EngineCoreProfile, List[str]] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index f8b02ba7fa936..6e15665df61de 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -18,6 +18,7 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils import get_open_zmq_ipc_path +from vllm.v1.engine import EngineAbortRequest from vllm.v1.engine.core_client import MultiprocessEngineCore from vllm.v1.engine.detokenizer import DetokenizerClient from vllm.v1.engine.processor import Processor @@ -92,7 +93,6 @@ def __init__( usage_context=usage_context, ) - # self.output_handler: Optional[asyncio.Task] = None self.to_create_loop = True def __del__(self): @@ -172,11 +172,11 @@ async def add_request( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 2) Create Queue (output_handler pushes, generate pulls) + # 2) Create Queue (output_handler() pushes, generate() pulls) self.rid_to_queue[request_id] = asyncio.Queue() - # 3) Send to Detokenizer. - await self.detokenizer.add_request_async(engine_request) + # 3) Send to Detokenizer (which forwards to EngineCore). + await self.detokenizer.input_socket.send_pyobj(engine_request) return self.rid_to_queue[request_id] @@ -197,93 +197,68 @@ async def generate( ) -> AsyncGenerator[RequestOutput, None]: """ Main function called by the API server to kick off a request - * 1) Make a queue corresponding to the Request. - # 2) Processing the Input. - * 3) Adding the Request to the Detokenize + EngineCore. + * 1) Make an output queue for the Request. + # 2) Processing the Input (e.g. Tokenizer). 
+ * 3) Adding the Request to Detokenizer + EngineCore. The output_handler() loop runs in a background task, pulling from Detokenizer and pushing to the per request queue. - The generate() pulls from the per requests queue and yeilds + The generate() pulls from the per request queue and yeilds to the caller which iterates the AsyncGenerator. """ - # We start the output_handler on the first call to generate() so that - # we can call __init__ before the event loop starts, which enables us - # to handle startup failure gracefully in the OpenAI server. - if self.to_create_loop: - import signal - def signal_handler(self, signum=None, frame=None): - logger.warning( - f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." + try: + # We start the output_handler on the first call to generate() so that + # we can call __init__ before the event loop starts, which enables us + # to handle startup failure gracefully in the OpenAI server. + if self.to_create_loop: + import signal + def signal_handler(self, signum=None, frame=None): + logger.warning( + f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." + ) + + self.to_create_loop = False + loop = asyncio.get_event_loop() + loop.create_task(self.output_handler()) + loop.add_signal_handler(signal.SIGTERM, signal_handler) + + q = await self.add_request( + request_id, + prompt, + sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=priority, ) - self.to_create_loop = False - loop = asyncio.get_event_loop() - loop.create_task(self._run_output_handler()) - loop.add_signal_handler(signal.SIGTERM, signal_handler) - - queue = await self.add_request( - request_id, - prompt, - sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, - priority=priority, - ) - - while True: - try: - # Note: drain queue without awaiting if possible (this helps - # to avoid task switching under load + helps performance). - if queue.qsize() > 0: - out = queue.get_nowait() - else: - out = await asyncio.wait_for(queue.get(), timeout=4) + # The output_handler task pushes items into the queue. + # This task pulls from the queue and yields them. + while True: + # Note: drain queue without await if possible (avoids + # task switching under load --> helps performance). + out = q.get_nowait() if q.qsize() > 0 else await q.get() + # Note: both Detokenizer and EngineCore handle their + # own cleanup based on finished. if out.finished: del self.rid_to_queue[request_id] yield out break - - yield out - - except asyncio.TimeoutError: - logger.debug("%s request timed out waiting", request_id) - continue - - # async def _process_cancellations(self) -> None: - # """ - # Process requests cancelled from user disconnecting. - - # When a client disconnects, AsyncStream._cancel() is called. - # We passed a callback to AsyncStream(), which appends to - # self.client_aborted_requests. - # As a result, if any requests are canceled from the user side - # the request_id will show up in self.client_aborted_requests. - # """ - - # # Avoid streams having circular ref to parent AsyncLLM object. - # if not self.client_aborted_requests: - # return - # reqs_to_abort = self.client_aborted_requests.copy() - # self.client_aborted_requests.clear() - - # # Remove from Detokenizer. - # self.detokenizer.abort_requests(reqs_to_abort) - - # # Remove from RequestStreams. 
- # for request_id in reqs_to_abort: - # if self.log_requests: - # logger.info("User-cancelled request %s.", request_id) - # self._finish_stream(request_id) + yield out + + # Client request cancellation is handled through calling + # task.cancel() on generate. So we abort to alert the Detokenizer + # and the EngineCore. + except asyncio.CancelledError: + await self.abort(request_id) + raise - # # Remove from EngineCore. - # await self.engine_core.abort_requests_async(reqs_to_abort) - async def _run_output_handler(self): + async def output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" epoch = 0 @@ -301,16 +276,14 @@ async def _run_output_handler(self): self.rid_to_queue[out.request_id].put_nowait(out) - # 3) Abort any requests that finished due to stop strings. - # await self.engine_core.abort_requests_async(reqs_to_abort) - # 4) Abort any requests due to client cancellations. - # TODO: send back to detokenizer if this fails. - # await self._process_cancellations() + async def abort(self, request_id: str): - async def abort(self, request_id: str) -> None: - # Note: this is not used outside of testing. - pass + await self.detokenizer.input_socket.send_pyobj( + EngineAbortRequest([request_id])) + + if self.log_requests: + logger.info("Aborted %s.", request_id) def encode( self, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5f9674f2569c3..67920688f0031 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -18,8 +18,8 @@ from vllm.utils import get_open_zmq_ipc_path from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineCoreProfile, EngineRequest, - EngineRequestType, EngineRequestUnion, + EngineAbortRequest, EngineRequest, + EngineProfileRequest, EngineRequestUnion, BackgroundProcHandle) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor @@ -92,6 +92,8 @@ def _initialize_kv_caches(self, def add_request(self, request: EngineRequest): """Add request to the scheduler.""" + logger.debug("Adding request: %s", request.request_id) + if request.mm_hashes is not None: # Here, if hash exists for an image, then it will be fetched # from the cache, else it will be added to the cache. @@ -102,16 +104,15 @@ def add_request(self, request: EngineRequest): request.mm_inputs = self.mm_input_mapper_server.process_inputs( request.mm_inputs, request.mm_hashes) + # TODO: instead of sending EngineRequest, should we just send + # around Request? req = Request.from_engine_core_request(request) - self.scheduler.add_request(req) def abort_requests(self, request_ids: List[str]): """Abort requests from the scheduler.""" - # TODO: The scheduler doesn't really need to know the - # specific finish reason, TBD whether we propagate that - # (i.e. client-aborted vs stop criteria met). 
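output_handler() above is now the only consumer of the Detokenizer's output socket: it receives RequestOutputs in batches and routes each one onto the asyncio.Queue owned by its request id, so generate() only ever reads its own queue. A small standalone sketch of that routing step, with plain dicts standing in for RequestOutput and rid_to_queue:

    import asyncio
    from typing import Dict, List

    def route(outputs: List[dict],
              rid_to_queue: Dict[str, asyncio.Queue]) -> None:
        # Push each output onto the queue owned by its request id.
        for out in outputs:
            rid = out["request_id"]
            if rid not in rid_to_queue:
                raise RuntimeError(f"{rid} not in rid_to_queue")
            rid_to_queue[rid].put_nowait(out)

    queues = {"req-0": asyncio.Queue(), "req-1": asyncio.Queue()}
    route([{"request_id": "req-0", "text": "hi"}], queues)
    assert queues["req-0"].qsize() == 1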
+ logger.debug("Aborting requests: %s", request_ids) self.scheduler.finish_requests(request_ids, RequestStatus.FINISHED_ABORTED) @@ -294,38 +295,18 @@ def _handle_client_request(self, request: EngineRequestUnion) -> None: if isinstance(request, EngineRequest): self.add_request(request) - elif isinstance(request, EngineCoreProfile): + elif isinstance(request, EngineProfileRequest): self.model_executor.profile(request.is_start) + elif isinstance(request, EngineAbortRequest): + self.abort_requests(request.request_ids) else: - # TODO: make an EngineCoreAbort wrapper - assert isinstance(request, list) - self.abort_requests(request) + raise ValueError("Unknown request type: {request}") def process_input_socket(self, input_path: str): """Input socket IO thread.""" - # Msgpack serialization decoding. - decoder_add_req = PickleEncoder() - decoder_abort_req = PickleEncoder() - with zmq_socket_ctx(input_path, zmq.PULL) as socket: while True: - # (RequestType, RequestData) - # type_frame, data_frame = socket.recv_multipart(copy=False) - # request_type = type_frame.buffer - # request_data = data_frame.buffer - - - # # Deserialize the request data. - # if request_type == EngineRequestType.ADD.value: - # request = decoder_add_req.decode(request_data) - # elif request_type == EngineRequestType.ABORT.value: - # request = decoder_abort_req.decode(request_data) - # elif request_type == EngineRequestType.PROFILE.value: - # request = pickle.loads(request_data) - # else: - # raise ValueError(f"Unknown RequestType: {request_type}") - # Push to input queue for core busy loop. request = socket.recv_pyobj() self.input_queue.put_nowait(request) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index b328d7337b18a..07f0ae2c9059f 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -142,9 +142,6 @@ def __init__( self._finalizer = weakref.finalize(self, self.shutdown) def shutdown(self): - # Shut down the zmq context. - self.ctx.destroy(linger=0) - if hasattr(self, "proc_handle") and self.proc_handle: # Shutdown the process if needed. 
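With the EngineRequestType byte-prefix protocol gone, the EngineCore input socket carries plain pickled objects and _handle_client_request() above dispatches purely on type. A minimal model of that dispatch, using stand-in dataclasses rather than the real EngineRequest / EngineAbortRequest / EngineProfileRequest:

    from dataclasses import dataclass
    from typing import List, Union

    @dataclass
    class AddRequest:        # stand-in for EngineRequest
        request_id: str

    @dataclass
    class AbortRequest:      # stand-in for EngineAbortRequest
        request_ids: List[str]

    @dataclass
    class ProfileRequest:    # stand-in for EngineProfileRequest
        is_start: bool

    RequestUnion = Union[AddRequest, AbortRequest, ProfileRequest]

    def handle(req: RequestUnion) -> str:
        if isinstance(req, AddRequest):
            return f"add {req.request_id}"
        if isinstance(req, AbortRequest):
            return f"abort {req.request_ids}"
        if isinstance(req, ProfileRequest):
            return "start profile" if req.is_start else "stop profile"
        raise ValueError(f"Unknown request type: {req}")

    assert handle(AbortRequest(["req-0"])) == "abort ['req-0']"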
if self.proc_handle.proc.is_alive(): diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 6f03ec57e1105..ab06d811b1c70 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -3,7 +3,7 @@ import msgspec import signal from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Union +from typing import Dict, Iterable, List, Optional, Tuple,Union from vllm.engine.output_processor.stop_checker import StopChecker from vllm.executor.multiproc_worker_utils import get_mp_context @@ -15,7 +15,8 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import get_open_zmq_ipc_path, kill_process_tree from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - BackgroundProcHandle, EngineRequest) + BackgroundProcHandle, + EngineRequest, EngineAbortRequest) from vllm.v1.utils import (make_zmq_socket, zmq_socket_ctx, wait_for_startup) @@ -234,14 +235,14 @@ def abort_requests( self.request_states.pop(request_id, None) def step( - self, encore_core_outputs: List[EngineCoreOutput] - ) -> List[RequestOutput]: - """Update state and request the RequestOutputs to the LLMEngine.""" + self, encore_core_outputs: EngineCoreOutputs, + ) -> Tuple[List[RequestOutput], List[str]]: + """Update state and make RequestOutputs for the LLMEngine.""" request_outputs: List[RequestOutput] = [] - # requests_to_abort: List[str] = [] + requests_to_abort: List[str] = [] - for engine_core_output in encore_core_outputs: + for engine_core_output in encore_core_outputs.outputs: request_id = engine_core_output.request_id detokenizer = self.request_states.get(request_id) @@ -261,17 +262,16 @@ def step( request_outputs.append(request_output) # # Free completed requests. - # if request_output.finished: - # self.request_states.pop(request_id) - # # If Request finished but EngineCore not finished, - # # this was caused by a stop string + we need to send - # # an abort signal to the EngineCore. - # if not engine_core_output.finished: - # requests_to_abort.append(request_id) + if request_output.finished: + self.request_states.pop(request_id) + # If Request finished but EngineCore not finished, + # this was caused by a stop string + we need to send + # an abort signal to the EngineCore. + if not engine_core_output.finished: + requests_to_abort.append(request_id) # Return to EngineClient. - # return request_outputs, requests_to_abort - return request_outputs, [] + return request_outputs, requests_to_abort class DetokenizerProc(Detokenizer): """ZMQ-wrapper for running Detokenizer in background process.""" @@ -374,60 +374,95 @@ def signal_handler(signum, frame): if detokenizer is not None: detokenizer = None - def run_busy_loop(self): - """Core busy loop of the Detokenizer.""" + def _handle_from_llm_engine( + self, + from_llm_engine: zmq.Socket, + to_engine_core: zmq.Socket, + ) -> None: + """Handle EngineRequest from the LLMEngine.""" + + pickled_req = from_llm_engine.recv() + req = pickle.loads(pickled_req) + + # Request added by client, add to RequestStates. + if isinstance(req, EngineRequest): + if req.request_id in self.request_states: + raise ValueError( + f"{req.request_id} already in Request States!") + + # Add to RequestStates. + request_state = IncrementalDetokenizer.from_new_request( + self.tokenizer, req) + self.request_states[req.request_id] = request_state + + # Request aborted by client, delete from RequestStates. 
+ elif isinstance(req, EngineAbortRequest): + if req.request_id not in self.request_states: + # If not found, the request is already completed + # and we can safely ignore. + pass + del self.request_states[req.request_id] + + else: + raise ValueError(f"Unknown type: {req}") - try: - # TODO: handle aborted due to client cancellation - # TODO: pickle -> msgpack - # TODO: send stop string aborts back to EngineCore directly + # Forward to EngineCore. + to_engine_core.send(pickled_req) + + def _handle_from_engine_core( + self, + from_engine_core: zmq.Socket, + to_engine_core: zmq.Socket, + to_llm_engine: zmq.Socket, + decoder: msgspec.msgpack.Decoder, + ) -> None: + """Handle Outputs from the EngineCore.""" - decoder_out = msgspec.msgpack.Decoder(EngineCoreOutputs) + # Deserialize the EngineOutput (use msgpack for performance). + (frame, ) = from_engine_core.recv_multipart(copy=False) + outputs: EngineCoreOutputs = decoder.decode(frame.buffer) - with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as from_engine_core, - zmq_socket_ctx(self.input_path, zmq.PULL) as from_llm_engine, - zmq_socket_ctx(self.engine_core_inputs_path, zmq.PUSH) as to_engine_core, - zmq_socket_ctx(self.output_path, zmq.PUSH) as to_llm_engine): + # Detokenize. + request_outputs, requests_to_abort = self.step(outputs.outputs) - # TODO: avoid poll by having both EngineCore - # and AsyncLLM send to the same socket (unclear why this - # was not working when I originally tried it) - poller = zmq.Poller() - poller.register(from_engine_core, zmq.POLLIN) - poller.register(from_llm_engine, zmq.POLLIN) + # Send request outputs back to LLMEngine. + to_llm_engine.send_pyobj(request_outputs) - epoch = 0 - while True: - logger.info(f"EPOCH: {epoch}") + # Abort requests that finished due to stop strings. + to_engine_core.send_pyobj(EngineAbortRequest(requests_to_abort)) + - socks = dict(poller.poll()) + def run_busy_loop(self): + """Core busy loop of the Detokenizer.""" - # Handle NewRequest. - if from_llm_engine in socks: - pickled_request = from_llm_engine.recv() - request: EngineRequest = pickle.loads(pickled_request) + decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) - assert (request.request_id not in self.request_states) + with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as from_engine_core, + zmq_socket_ctx(self.engine_core_inputs_path, zmq.PUSH) as to_engine_core, + zmq_socket_ctx(self.input_path, zmq.PULL) as from_llm_engine, + zmq_socket_ctx(self.output_path, zmq.PUSH) as to_llm_engine): - # Add to Detokenizer. - request_state = IncrementalDetokenizer.from_new_request(self.tokenizer, request) - self.request_states[request.request_id] = request_state + # TODO(rob): avoid poll by having both EngineCore and + # LLMEngine send to the same socket. + poller = zmq.Poller() + poller.register(from_engine_core, zmq.POLLIN) + poller.register(from_llm_engine, zmq.POLLIN) - # Forward to EngineCore. - to_engine_core.send(pickled_request) + epoch = 0 + while True: + logger.info(f"EPOCH: {epoch}") + socks = dict(poller.poll()) - # Handle EngineCoreOutput. - if from_engine_core in socks: - epoch += 1 + # Handle input from LLMEngine. 
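_handle_from_engine_core() above keeps msgpack on the hot EngineCore -> Detokenizer path: outputs arrive as a single zero-copy frame and are decoded with a typed msgspec decoder. A round-trip sketch with stand-in Structs (the field sets here are trimmed for illustration, not the full EngineCoreOutput):

    from typing import List
    import msgspec

    class Output(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
        request_id: str
        new_token_ids: List[int]
        finished: bool = False

    class Outputs(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
        outputs: List[Output]

    encoder = msgspec.msgpack.Encoder()
    decoder = msgspec.msgpack.Decoder(Outputs)

    buffer = bytearray()
    encoder.encode_into(Outputs(outputs=[Output("req-0", [42])]), buffer)
    decoded = decoder.decode(buffer)
    assert decoded.outputs[0].new_token_ids == [42]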
+ if from_llm_engine in socks: + self._handle_from_llm_engine( + from_llm_engine, to_engine_core) - (frame, ) = from_engine_core.recv_multipart(copy=False) - engine_core_outputs = decoder_out.decode(frame.buffer).outputs - request_outputs, _ = self.step(engine_core_outputs) - to_llm_engine.send_pyobj(request_outputs) - - except Exception as e: - logger.error(e) - raise e + # Handle output from EngineCoreOutput. + if from_engine_core in socks: + epoch += 1 + self._handle_from_engine_core( + from_engine_core, to_llm_engine, decoder) class DetokenizerClient: @@ -450,8 +485,7 @@ def __init__(self, # Get output (RequestOutput) from Detokenizer. output_path = get_open_zmq_ipc_path() - self.output_socket = make_zmq_socket( - self.ctx, + self.output_socket = make_zmq_socket(self.ctx, output_path, zmq.PULL, ) @@ -473,13 +507,3 @@ def shutdown(self): if self.proc_handle.proc.is_alive(): kill_process_tree(self.proc_handle.proc.pid) - - async def add_request_async(self, request: EngineRequest): - """Send new DetokenizerRequest to Detokenizer.""" - - await self.input_socket.send_pyobj(request) - - async def get_output_async(self) -> List[RequestOutput]: - """Get RequestOutputs, RequestsToAbort from Detokenizer.""" - - return await self.output_socket.recv_pyobj() From 28da5b311033a83064263c60b5b33901a5df1a4f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 20:31:22 +0000 Subject: [PATCH 073/132] updated --- vllm/v1/engine/async_llm.py | 141 +++++++++++++++++++++++----------- vllm/v1/engine/core.py | 2 +- vllm/v1/engine/core_client.py | 8 +- vllm/v1/engine/detokenizer.py | 16 ++-- 4 files changed, 112 insertions(+), 55 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6e15665df61de..94d542fa096d7 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,5 +1,22 @@ +# Copyright 2033-2024 The vLLM team. +# Copyright 2023-2024 SGLang Team +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# Inspired by https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/managers/tokenizer_manager.py + import asyncio -import fastapi +import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -43,32 +60,13 @@ def __init__( ) -> None: assert start_engine_loop - self.warned = False self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config - # Tokenizer (+ ensure liveness if running in another process). - self.tokenizer = init_tokenizer_from_configs( - model_config=vllm_config.model_config, - scheduler_config=vllm_config.scheduler_config, - parallel_config=vllm_config.parallel_config, - lora_config=vllm_config.lora_config) - self.tokenizer.ping() - # RequestId -> OutputQueue. self.rid_to_queue: Dict[str, asyncio.Queue[RequestOutput]] = {} - # List of cancelled request ids to be aborted. 
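Each hop in the three-process layout (AsyncLLM -> Detokenizer -> EngineCore and back) is a ZMQ PUSH/PULL pair addressed by an IPC path from get_open_zmq_ipc_path(), which is what DetokenizerClient wires up above. A stripped-down sketch of one such hop; which side binds is decided by make_zmq_socket() in the real code, and the temp path below is made up for illustration:

    import os
    import tempfile
    import zmq

    path = "ipc://" + os.path.join(tempfile.mkdtemp(), "demo.sock")

    ctx = zmq.Context()
    pull = ctx.socket(zmq.PULL)
    pull.bind(path)            # receiver binds in this sketch
    push = ctx.socket(zmq.PUSH)
    push.connect(path)         # sender connects

    push.send_pyobj({"request_id": "req-0", "new_token_ids": [42]})
    print(pull.recv_pyobj())   # -> the dict sent above

    ctx.destroy(linger=0)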
- self.client_aborted_requests: List[str] = [] - - # Processor (converts Inputs --> EngineRequest). - self.processor = Processor( - model_config=vllm_config.model_config, - cache_config=vllm_config.cache_config, - lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer, - input_registry=input_registry) # IPC paths. engine_core_outputs_path = get_open_zmq_ipc_path() @@ -93,7 +91,28 @@ def __init__( usage_context=usage_context, ) + # Tokenizer (+ ensure liveness if running in another process). + # Note: make last to avoid fork before using tokenizers + # and avoid TOKENIZERS_PARALLELISM issues. + self.tokenizer = init_tokenizer_from_configs( + model_config=vllm_config.model_config, + scheduler_config=vllm_config.scheduler_config, + parallel_config=vllm_config.parallel_config, + lora_config=vllm_config.lora_config) + self.tokenizer.ping() + + # Processor (converts Inputs --> EngineRequest). + self.processor = Processor( + model_config=vllm_config.model_config, + cache_config=vllm_config.cache_config, + lora_config=vllm_config.lora_config, + tokenizer=self.tokenizer, + input_registry=input_registry) + + # Create output handler loop during first call to generate(). self.to_create_loop = True + self.gracefully_exit = False + self.asyncio_tasks = set() def __del__(self): self.shutdown() @@ -137,9 +156,6 @@ def shutdown(self): if detokenizer := getattr(self, "detokenizer", None): detokenizer.shutdown() - if handler := getattr(self, "output_handler", None): - handler.cancel() - @classmethod def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: executor_class: Type[Executor] @@ -209,20 +225,9 @@ async def generate( """ try: - # We start the output_handler on the first call to generate() so that - # we can call __init__ before the event loop starts, which enables us - # to handle startup failure gracefully in the OpenAI server. + # Start output_handler on first request. if self.to_create_loop: - import signal - def signal_handler(self, signum=None, frame=None): - logger.warning( - f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..." - ) - - self.to_create_loop = False - loop = asyncio.get_event_loop() - loop.create_task(self.output_handler()) - loop.add_signal_handler(signal.SIGTERM, signal_handler) + self.create_output_handler() q = await self.add_request( request_id, @@ -235,14 +240,14 @@ def signal_handler(self, signum=None, frame=None): ) # The output_handler task pushes items into the queue. - # This task pulls from the queue and yields them. + # This task pulls from the queue and yields to caller. while True: # Note: drain queue without await if possible (avoids - # task switching under load --> helps performance). + # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() # Note: both Detokenizer and EngineCore handle their - # own cleanup based on finished. + # own request cleanup based on finished. if out.finished: del self.rid_to_queue[request_id] yield out @@ -251,15 +256,47 @@ def signal_handler(self, signum=None, frame=None): yield out # Client request cancellation is handled through calling - # task.cancel() on generate. So we abort to alert the Detokenizer - # and the EngineCore. + # task.cancel() on generate. So if we get this error, we + # need to abort the request. except asyncio.CancelledError: await self.abort(request_id) raise + + def create_output_handler(self): + """Creates output handler loop. 
Called on first generate().""" + + self.to_create_loop = False + loop = asyncio.get_event_loop() + + # Start output handler. + self.asyncio_tasks.add(loop.create_task(self.output_handler())) + + # Start signal handlers for shutdown. + signal_handler = SignalHandler(self) + loop.add_signal_handler(signal.SIGTERM, signal_handler.signal_handler) + self.asyncio_tasks.add(loop.create_task(self.sigterm_watchdog())) + + + async def sigterm_watchdog(self): + """Handle shutdown from sigterm.""" + + while not self.gracefully_exit: + await asyncio.sleep(5) + # Drain requests + while True: + remain_num_req = len(self.rid_to_state) + logger.info( + f"Gracefully exiting... remaining number of requests {remain_num_req}" + ) + if remain_num_req > 0: + await asyncio.sleep(5) + else: + break + self.shutdown() async def output_handler(self): - """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" + """Background loop: pulls from Detokenizer and pushes to queues.""" epoch = 0 while True: @@ -269,6 +306,7 @@ async def output_handler(self): # 1) Pull outputs from the Detokenizer. outputs: List[RequestOutput] = await self.detokenizer.output_socket.recv_pyobj() + # 2) Put each output into a per request Queue. for out in outputs: if out.request_id not in self.rid_to_queue: raise RuntimeError(f"{out.request_id} " @@ -278,9 +316,14 @@ async def output_handler(self): async def abort(self, request_id: str): - + # Remove from Detokenizer and EngineCore (Detokenizer + # forwards the message to EngineCore). await self.detokenizer.input_socket.send_pyobj( EngineAbortRequest([request_id])) + + # Remove from request output queues. + if request_id in self.rid_to_queue: + del self.rid_to_queue[request_id] if self.log_requests: logger.info("Aborted %s.", request_id) @@ -345,3 +388,15 @@ def errored(self) -> bool: @property def dead_error(self) -> BaseException: return Exception() # TODO: implement + + +class SignalHandler: + def __init__(self, async_llm): + self.async_llm = async_llm + + def signal_handler(self, signum=None, frame=None): + logger.warning( + "SIGTERM received. signum=%s frame=%s. 
Draining " + "requests and shutting down...", signum, frame, + ) + self.async_llm.gracefully_exit = True diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 67920688f0031..20ed8c93e11df 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -243,7 +243,7 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the EngineCore.""" - + # Loop until process is sent a SIGINT or SIGTERM epoch = 0 while True: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 07f0ae2c9059f..5829ef350f438 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -8,13 +8,9 @@ from vllm.logger import init_logger from vllm.utils import kill_process_tree, get_open_zmq_ipc_path -from vllm.v1.engine import (BackgroundProcHandle, - EngineCoreOutput, EngineCoreOutputs, - EngineCoreProfile, EngineRequest, - EngineRequestType, EngineRequestUnion) +from vllm.v1.engine import (BackgroundProcHandle, EngineCoreOutput, + EngineRequest) from vllm.v1.engine.core import (EngineCore, EngineCoreProc) -from vllm.v1.serial_utils import PickleEncoder -from vllm.v1.utils import make_zmq_socket logger = init_logger(__name__) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index ab06d811b1c70..b243e787024e2 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -420,10 +420,10 @@ def _handle_from_engine_core( # Deserialize the EngineOutput (use msgpack for performance). (frame, ) = from_engine_core.recv_multipart(copy=False) - outputs: EngineCoreOutputs = decoder.decode(frame.buffer) + outputs: EngineCoreOutputs = decoder.decode(frame.buffer) # Detokenize. - request_outputs, requests_to_abort = self.step(outputs.outputs) + request_outputs, requests_to_abort = self.step(outputs) # Send request outputs back to LLMEngine. to_llm_engine.send_pyobj(request_outputs) @@ -450,19 +450,25 @@ def run_busy_loop(self): epoch = 0 while True: - logger.info(f"EPOCH: {epoch}") socks = dict(poller.poll()) # Handle input from LLMEngine. if from_llm_engine in socks: self._handle_from_llm_engine( - from_llm_engine, to_engine_core) + from_llm_engine=from_llm_engine, + to_engine_core=to_engine_core, + ) # Handle output from EngineCoreOutput. 
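DetokenizerProc.run_busy_loop() above multiplexes two inbound sockets, new requests from the LLMEngine and outputs from the EngineCore, with a zmq.Poller and handles whichever is ready. A minimal two-socket poll loop of the same shape; the inproc endpoints and payloads are made up for this sketch:

    import zmq

    ctx = zmq.Context()
    from_llm_engine = ctx.socket(zmq.PULL)
    from_llm_engine.bind("inproc://llm_engine")
    from_engine_core = ctx.socket(zmq.PULL)
    from_engine_core.bind("inproc://engine_core")

    feed = ctx.socket(zmq.PUSH)
    feed.connect("inproc://llm_engine")
    feed.send_pyobj("new request")

    poller = zmq.Poller()
    poller.register(from_llm_engine, zmq.POLLIN)
    poller.register(from_engine_core, zmq.POLLIN)

    socks = dict(poller.poll(timeout=100))   # milliseconds
    if from_llm_engine in socks:
        print("from LLMEngine:", from_llm_engine.recv_pyobj())
    if from_engine_core in socks:
        print("from EngineCore:", from_engine_core.recv_pyobj())

    ctx.destroy(linger=0)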
if from_engine_core in socks: + logger.info(f"EPOCH: {epoch}") epoch += 1 self._handle_from_engine_core( - from_engine_core, to_llm_engine, decoder) + from_engine_core=from_engine_core, + to_engine_core=to_engine_core, + to_llm_engine=to_llm_engine, + decoder=decoder, + ) class DetokenizerClient: From e14def69c99bb3afbb13963e8ae8a94b93d2acc1 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 20:31:54 +0000 Subject: [PATCH 074/132] updated --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 94d542fa096d7..472777bacfcb7 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -152,7 +152,7 @@ def shutdown(self): if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() - + if detokenizer := getattr(self, "detokenizer", None): detokenizer.shutdown() From 074af11ee70ed3d75c54a1bbe5842692f88b53aa Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 20:33:19 +0000 Subject: [PATCH 075/132] updated --- vllm/v1/engine/detokenizer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index b243e787024e2..ef14511754a66 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -241,10 +241,8 @@ def step( request_outputs: List[RequestOutput] = [] requests_to_abort: List[str] = [] - for engine_core_output in encore_core_outputs.outputs: request_id = engine_core_output.request_id - detokenizer = self.request_states.get(request_id) if detokenizer is None: # Ignore output for already-aborted request. @@ -256,12 +254,12 @@ def step( finish_reason=engine_core_output.finish_reason, stop_reason=engine_core_output.stop_reason, ) - + if request_output is not None: # Add to RequestOutputs list. request_outputs.append(request_output) - # # Free completed requests. + # Free completed requests. if request_output.finished: self.request_states.pop(request_id) # If Request finished but EngineCore not finished, From 3df5288e7c824ad01eaa770b58e4df61e9f4a2e3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 20:34:13 +0000 Subject: [PATCH 076/132] updated --- examples/openai_completion_client.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index 1f8b82bc5c9e9..58519f978d340 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -2,7 +2,7 @@ # Modify OpenAI's API key and API base to use vLLM's API server. 
openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8001/v1" +openai_api_base = "http://localhost:8000/v1" client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") @@ -14,12 +14,14 @@ model = models.data[0].id # Completion API -stream = True +stream = False completion = client.completions.create( model=model, prompt="A robot may not injure a human being", echo=False, - stream=stream) + n=2, + stream=stream, + logprobs=3) print("Completion results:") if stream: From 546b0de8b1454676be80e3b26032019d4e35b8c7 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 20:36:01 +0000 Subject: [PATCH 077/132] updated --- vllm/entrypoints/openai/serving_completion.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 2bc0c6d1f1c8f..af5987fcebd6c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -168,8 +168,7 @@ async def create_completion( # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - # result_generator = merge_async_iterators(*generators) - result_generator = generator + result_generator = merge_async_iterators(*generators) model_name = self._get_model_name(lora_request) num_prompts = len(engine_prompts) @@ -265,8 +264,7 @@ async def completion_stream_generator( include_usage, include_continuous_usage = False, False try: - # async for prompt_idx, res in result_generator: - async for res in result_generator: + async for prompt_idx, res in result_generator: prompt_idx = 0 prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs From c700c4a5ea693fc7baa5a27e212ec053b3e4bc2e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 22 Dec 2024 20:36:22 +0000 Subject: [PATCH 078/132] remove --- vllm/entrypoints/openai/serving_completion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index af5987fcebd6c..aaad7b8c7f44c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -265,7 +265,6 @@ async def completion_stream_generator( try: async for prompt_idx, res in result_generator: - prompt_idx = 0 prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs prompt_text = res.prompt From 5b568daa617803f7a1429d8b0c39136af69d5b2a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 14:37:10 +0000 Subject: [PATCH 079/132] send messages only when needed --- vllm/v1/engine/core.py | 3 ++- vllm/v1/engine/core_client.py | 4 ---- vllm/v1/engine/detokenizer.py | 4 +++- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 20ed8c93e11df..76cdd027ec319 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -272,7 +272,8 @@ def run_busy_loop(self): outputs = self.step() # 4) Put EngineCoreOutputs into the output queue. 
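Patch 077 above restores merge_async_iterators(), so the completion endpoint once again folds all per-prompt generators into a single stream of (prompt_idx, output) pairs, and patch 078 drops the temporary prompt_idx = 0 shim. Roughly what such a merge does, sketched with a queue (a simplified stand-in, not the actual vllm.utils implementation):

    import asyncio
    from typing import Any, AsyncIterator, Tuple

    async def merge(*its: AsyncIterator[Any]) -> AsyncIterator[Tuple[int, Any]]:
        queue: asyncio.Queue = asyncio.Queue()
        done = object()

        async def pump(idx: int, it: AsyncIterator[Any]) -> None:
            async for item in it:
                await queue.put((idx, item))
            await queue.put(done)

        tasks = [asyncio.create_task(pump(i, it)) for i, it in enumerate(its)]
        remaining = len(tasks)
        while remaining:
            item = await queue.get()
            if item is done:
                remaining -= 1
            else:
                yield item

    async def demo() -> None:
        async def gen(i: int):
            yield f"tok-from-prompt-{i}"
        async for prompt_idx, item in merge(gen(0), gen(1)):
            print(prompt_idx, item)

    asyncio.run(demo())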
- self.output_queue.put_nowait(outputs) + if len(outputs) > 0: + self.output_queue.put_nowait(outputs) self._log_stats() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 5829ef350f438..181d6e3874d70 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -2,10 +2,6 @@ import weakref from typing import List, Optional -import msgspec -import zmq -import zmq.asyncio - from vllm.logger import init_logger from vllm.utils import kill_process_tree, get_open_zmq_ipc_path from vllm.v1.engine import (BackgroundProcHandle, EngineCoreOutput, diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index ef14511754a66..ff047284c05eb 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -427,7 +427,9 @@ def _handle_from_engine_core( to_llm_engine.send_pyobj(request_outputs) # Abort requests that finished due to stop strings. - to_engine_core.send_pyobj(EngineAbortRequest(requests_to_abort)) + if len(requests_to_abort) > 0: + to_engine_core.send_pyobj( + EngineAbortRequest(requests_to_abort)) def run_busy_loop(self): From 93c4ea4517e26654d459edaf8d0d9d1744ef4abc Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 15:48:16 +0000 Subject: [PATCH 080/132] added flag for request id headers --- vllm/entrypoints/api_server.py | 4 +++- vllm/entrypoints/openai/api_server.py | 22 +++++++++++++++------- vllm/entrypoints/openai/cli_args.py | 6 +++++- vllm/utils.py | 17 +++++++++++++++++ vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/detokenizer.py | 4 ++-- 6 files changed, 43 insertions(+), 12 deletions(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 95da1c6e7b9bf..09983d9561532 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -21,7 +21,7 @@ from vllm.logger import init_logger from vllm.sampling_params import SamplingParams from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, random_uuid +from vllm.utils import FlexibleArgumentParser, random_uuid, set_ulimit from vllm.version import __version__ as VLLM_VERSION logger = init_logger("vllm.entrypoints.api_server") @@ -118,6 +118,8 @@ async def run_server(args: Namespace, **uvicorn_kwargs: Any) -> None: logger.info("vLLM API server version %s", VLLM_VERSION) logger.info("args: %s", args) + + set_ulimit() app = await init_app(args, llm_engine) assert engine is not None diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 73a4dc1c51185..67419accd9b60 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -64,7 +64,7 @@ from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, - is_valid_ipv6_address) + is_valid_ipv6_address, set_ulimit) from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -533,12 +533,17 @@ async def authentication(request: Request, call_next): status_code=401) return await call_next(request) - # @app.middleware("http") - # async def add_request_id(request: Request, call_next): - # request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex - # response = await call_next(request) - # response.headers["X-Request-Id"] = request_id - # return response + if args.enable_request_id_headers: + logger.warning( + "CAUTION: Enabling X-Request-Id headers in the API Server. 
" + "This can harm performance at high QPS.") + + @app.middleware("http") + async def add_request_id(request: Request, call_next): + request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex + response = await call_next(request) + response.headers["X-Request-Id"] = request_id + return response for middleware in args.middleware: module_path, object_name = middleware.rsplit(".", 1) @@ -662,6 +667,9 @@ async def run_server(args, **uvicorn_kwargs) -> None: sock_addr = (args.host or "", args.port) sock = create_server_socket(sock_addr) + # workaround to ensure user has enough fds available for uvicorn + ipc + set_ulimit() + def signal_handler(*_) -> None: # Interrupt server on sigterm while initializing raise KeyboardInterrupt("terminated") diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 24c206a1261f2..908f8c3532c9e 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -196,7 +196,11 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: action="store_true", help="If specified, will run the OpenAI frontend server in the same " "process as the model serving engine.") - + parser.add_argument( + "--enable-request-id-headers", + action="store_true", + help="If specified, API server will add X-Request-Id header to " + "responses. Caution: this hurts performance at high QPS.") parser.add_argument( "--enable-auto-tool-choice", action="store_true", diff --git a/vllm/utils.py b/vllm/utils.py index 1b90eca1cd6cc..fd9ca5984c4e0 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -10,6 +10,7 @@ import inspect import ipaddress import os +import resource import signal import socket import subprocess @@ -1613,6 +1614,22 @@ def resolve_obj_by_qualname(qualname: str) -> Any: return getattr(module, obj_name) +# Taken from https://github.com/sgl-project/sglang/blob/23e5e50fd5fba7f315e04294f55060a8171fcc69/python/sglang/srt/utils.py#L630 # noqa: E501 +def set_ulimit(target_soft_limit=65535): + resource_type = resource.RLIMIT_NOFILE + current_soft, current_hard = resource.getrlimit(resource_type) + + if current_soft < target_soft_limit: + try: + resource.setrlimit(resource_type, (target_soft_limit, current_hard)) + except ValueError as e: + logger.warning( + "Found ulimit of %s and failed to automatically increase" + "with error %s. This can cause fd limit errors like" + "`OSError: [Errno 24] Too many open files`. Consider " + "increasing with ulimit -n", current_soft, e) + + def kill_process_tree(pid: int): """ Kills all descendant processes of the given pid by sending SIGKILL. diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 472777bacfcb7..fc248ccfc9be0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -301,7 +301,7 @@ async def output_handler(self): epoch = 0 while True: logger.info(f"EPOCH: {epoch}") - epoch+=1 + epoch += 1 # 1) Pull outputs from the Detokenizer. 
outputs: List[RequestOutput] = await self.detokenizer.output_socket.recv_pyobj() diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index ff047284c05eb..8e050e4970bf6 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -14,7 +14,7 @@ AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import get_open_zmq_ipc_path, kill_process_tree -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, +from vllm.v1.engine import (EngineCoreOutputs, BackgroundProcHandle, EngineRequest, EngineAbortRequest) from vllm.v1.utils import (make_zmq_socket, zmq_socket_ctx, @@ -450,6 +450,7 @@ def run_busy_loop(self): epoch = 0 while True: + logger.info(f"EPOCH: {epoch}") socks = dict(poller.poll()) # Handle input from LLMEngine. @@ -461,7 +462,6 @@ def run_busy_loop(self): # Handle output from EngineCoreOutput. if from_engine_core in socks: - logger.info(f"EPOCH: {epoch}") epoch += 1 self._handle_from_engine_core( from_engine_core=from_engine_core, From 548ae691d60420dd8f17c6dcc10127a5378864d2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 15:49:41 +0000 Subject: [PATCH 081/132] fixed too long line --- benchmarks/benchmark_throughput.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 5010bf1988e9b..db7724e1d707c 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -414,6 +414,7 @@ def main(args: argparse.Namespace): for request in requests) total_output_tokens = sum(request.expected_output_len for request in requests) + total_input_tokens = total_num_tokens - total_output_tokens if is_multi_modal: print("\033[91mWARNING\033[0m: Multi-modal request detected. 
The " "following metrics are not accurate because image tokens are not" @@ -422,7 +423,7 @@ def main(args: argparse.Namespace): print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s, " - f"{(total_num_tokens - total_output_tokens) / len(requests)} input tokens/req, " + f"{total_input_tokens / len(requests)} input tokens/req, " f"{(total_output_tokens) / len(requests)} output tokens/req, " ) From 729df02702f6c5e955cc4ce1c2b113a7e7d0c977 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 15:55:51 +0000 Subject: [PATCH 082/132] updated --- benchmarks/benchmark_throughput.py | 3 +-- vllm/entrypoints/api_server.py | 2 +- vllm/entrypoints/openai/api_server.py | 3 ++- vllm/utils.py | 3 ++- vllm/v1/engine/__init__.py | 10 +++++--- vllm/v1/engine/async_llm.py | 35 ++++++++++++++------------- vllm/v1/engine/core.py | 2 +- vllm/v1/engine/core_client.py | 2 +- vllm/v1/utils.py | 11 +++------ 9 files changed, 36 insertions(+), 35 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index db7724e1d707c..990548c247822 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -424,8 +424,7 @@ def main(args: argparse.Namespace): f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s, " f"{total_input_tokens / len(requests)} input tokens/req, " - f"{(total_output_tokens) / len(requests)} output tokens/req, " - ) + f"{(total_output_tokens) / len(requests)} output tokens/req, ") # Output JSON results if specified if args.output_json: diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 09983d9561532..daefbff7e5178 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -118,7 +118,7 @@ async def run_server(args: Namespace, **uvicorn_kwargs: Any) -> None: logger.info("vLLM API server version %s", VLLM_VERSION) logger.info("args: %s", args) - + set_ulimit() app = await init_app(args, llm_engine) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 67419accd9b60..2aa666548bec5 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -540,7 +540,8 @@ async def authentication(request: Request, call_next): @app.middleware("http") async def add_request_id(request: Request, call_next): - request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex + request_id = request.headers.get( + "X-Request-Id") or uuid.uuid4().hex response = await call_next(request) response.headers["X-Request-Id"] = request_id return response diff --git a/vllm/utils.py b/vllm/utils.py index fd9ca5984c4e0..2a8af5ea2d5c5 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1621,7 +1621,8 @@ def set_ulimit(target_soft_limit=65535): if current_soft < target_soft_limit: try: - resource.setrlimit(resource_type, (target_soft_limit, current_hard)) + resource.setrlimit(resource_type, + (target_soft_limit, current_hard)) except ValueError as e: logger.warning( "Found ulimit of %s and failed to automatically increase" diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 4e84d763ae9be..7200aa9a208ee 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -32,17 +32,22 @@ class EngineRequest: arrival_time: float lora_request: Optional[LoRARequest] + @dataclass class 
EngineAbortRequest: request_ids: List[str] + @dataclass class EngineProfileRequest: is_start: bool -EngineRequestUnion = Union[EngineRequest, EngineAbortRequest, EngineProfileRequest] + +EngineRequestUnion = Union[EngineRequest, EngineAbortRequest, + EngineProfileRequest] + class EngineCoreOutput( msgspec.Struct, @@ -68,6 +73,3 @@ class EngineCoreOutputs( # [num_reqs] outputs: List[EngineCoreOutput] - - - diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fc248ccfc9be0..81d65aec2892f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -102,12 +102,11 @@ def __init__( self.tokenizer.ping() # Processor (converts Inputs --> EngineRequest). - self.processor = Processor( - model_config=vllm_config.model_config, - cache_config=vllm_config.cache_config, - lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer, - input_registry=input_registry) + self.processor = Processor(model_config=vllm_config.model_config, + cache_config=vllm_config.cache_config, + lora_config=vllm_config.lora_config, + tokenizer=self.tokenizer, + input_registry=input_registry) # Create output handler loop during first call to generate(). self.to_create_loop = True @@ -187,7 +186,7 @@ async def add_request( engine_request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - + # 2) Create Queue (output_handler() pushes, generate() pulls) self.rid_to_queue[request_id] = asyncio.Queue() @@ -254,14 +253,14 @@ async def generate( break yield out - + # Client request cancellation is handled through calling # task.cancel() on generate. So if we get this error, we # need to abort the request. except asyncio.CancelledError: await self.abort(request_id) raise - + def create_output_handler(self): """Creates output handler loop. Called on first generate().""" @@ -276,7 +275,6 @@ def create_output_handler(self): loop.add_signal_handler(signal.SIGTERM, signal_handler.signal_handler) self.asyncio_tasks.add(loop.create_task(self.sigterm_watchdog())) - async def sigterm_watchdog(self): """Handle shutdown from sigterm.""" @@ -294,7 +292,6 @@ async def sigterm_watchdog(self): break self.shutdown() - async def output_handler(self): """Background loop: pulls from Detokenizer and pushes to queues.""" @@ -304,17 +301,18 @@ async def output_handler(self): epoch += 1 # 1) Pull outputs from the Detokenizer. - outputs: List[RequestOutput] = await self.detokenizer.output_socket.recv_pyobj() + outputs: List[ + RequestOutput] = await self.detokenizer.output_socket.recv_pyobj( + ) # 2) Put each output into a per request Queue. for out in outputs: if out.request_id not in self.rid_to_queue: raise RuntimeError(f"{out.request_id} " - "not in RequestStates") + "not in RequestStates") self.rid_to_queue[out.request_id].put_nowait(out) - async def abort(self, request_id: str): # Remove from Detokenizer and EngineCore (Detokenizer # forwards the message to EngineCore). @@ -324,7 +322,7 @@ async def abort(self, request_id: str): # Remove from request output queues. if request_id in self.rid_to_queue: del self.rid_to_queue[request_id] - + if self.log_requests: logger.info("Aborted %s.", request_id) @@ -391,12 +389,15 @@ def dead_error(self) -> BaseException: class SignalHandler: + def __init__(self, async_llm): self.async_llm = async_llm def signal_handler(self, signum=None, frame=None): logger.warning( - "SIGTERM received. signum=%s frame=%s. 
Draining " - "requests and shutting down...", signum, frame, + "SIGTERM received. signum=%s frame=%s. Draining " + "requests and shutting down...", + signum, + frame, ) self.async_llm.gracefully_exit = True diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 76cdd027ec319..e2970ac9cfa70 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -325,4 +325,4 @@ def process_output_socket(self, output_path: str): engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) encoder.encode_into(outputs, buffer) - socket.send_multipart((buffer,), copy=False) + socket.send_multipart((buffer, ), copy=False) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 181d6e3874d70..ba8b4c203801f 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -4,7 +4,7 @@ from vllm.logger import init_logger from vllm.utils import kill_process_tree, get_open_zmq_ipc_path -from vllm.v1.engine import (BackgroundProcHandle, EngineCoreOutput, +from vllm.v1.engine import (BackgroundProcHandle, EngineCoreOutput, EngineRequest) from vllm.v1.engine.core import (EngineCore, EngineCoreProc) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index b141110e8c2ab..492c9094f8307 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -80,18 +80,15 @@ def __len__(self): return len(self._x) -def make_zmq_socket( - ctx: Union[zmq.asyncio.Context, zmq.Context], - path: str, - type: Any - ) -> Union[zmq.Socket, zmq.asyncio.Socket]: +def make_zmq_socket(ctx: Union[zmq.asyncio.Context, zmq.Context], path: str, + type: Any) -> Union[zmq.Socket, zmq.asyncio.Socket]: """Make a ZMQ socket with the proper bind/connext semantics.""" import psutil mem = psutil.virtual_memory() socket = ctx.socket(type) - + total_mem = mem.total / 1024**3 available_mem = mem.available / 1024**3 if total_mem > 32 and available_mem > 16: @@ -112,6 +109,7 @@ def make_zmq_socket( return socket + @contextmanager def zmq_socket_ctx( path: str, @@ -151,4 +149,3 @@ def wait_for_startup( except BaseException as e: logger.exception(e) raise e - From 6ec9dcb122728117e952d003ab06d3e0df1d168b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 15:57:50 +0000 Subject: [PATCH 083/132] updated --- vllm/v1/engine/async_llm.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 81d65aec2892f..9d3606f0d7229 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -68,6 +68,21 @@ def __init__( # RequestId -> OutputQueue. self.rid_to_queue: Dict[str, asyncio.Queue[RequestOutput]] = {} + # Tokenizer (+ ensure liveness if running in another process). + self.tokenizer = init_tokenizer_from_configs( + model_config=vllm_config.model_config, + scheduler_config=vllm_config.scheduler_config, + parallel_config=vllm_config.parallel_config, + lora_config=vllm_config.lora_config) + self.tokenizer.ping() + + # Processor (converts Inputs --> EngineRequest). + self.processor = Processor(model_config=vllm_config.model_config, + cache_config=vllm_config.cache_config, + lora_config=vllm_config.lora_config, + tokenizer=self.tokenizer, + input_registry=input_registry) + # IPC paths. engine_core_outputs_path = get_open_zmq_ipc_path() engine_core_inputs_path = get_open_zmq_ipc_path() @@ -91,23 +106,6 @@ def __init__( usage_context=usage_context, ) - # Tokenizer (+ ensure liveness if running in another process). 
- # Note: make last to avoid fork before using tokenizers - # and avoid TOKENIZERS_PARALLELISM issues. - self.tokenizer = init_tokenizer_from_configs( - model_config=vllm_config.model_config, - scheduler_config=vllm_config.scheduler_config, - parallel_config=vllm_config.parallel_config, - lora_config=vllm_config.lora_config) - self.tokenizer.ping() - - # Processor (converts Inputs --> EngineRequest). - self.processor = Processor(model_config=vllm_config.model_config, - cache_config=vllm_config.cache_config, - lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer, - input_registry=input_registry) - # Create output handler loop during first call to generate(). self.to_create_loop = True self.gracefully_exit = False From fc6a20d786a045b73e9e9a24a256374ddb06cf1e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 16:05:36 +0000 Subject: [PATCH 084/132] make pr smaller --- vllm/v1/engine/core.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e2970ac9cfa70..8623612c6264d 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -92,8 +92,6 @@ def _initialize_kv_caches(self, def add_request(self, request: EngineRequest): """Add request to the scheduler.""" - logger.debug("Adding request: %s", request.request_id) - if request.mm_hashes is not None: # Here, if hash exists for an image, then it will be fetched # from the cache, else it will be added to the cache. @@ -104,15 +102,12 @@ def add_request(self, request: EngineRequest): request.mm_inputs = self.mm_input_mapper_server.process_inputs( request.mm_inputs, request.mm_hashes) - # TODO: instead of sending EngineRequest, should we just send - # around Request? req = Request.from_engine_core_request(request) self.scheduler.add_request(req) def abort_requests(self, request_ids: List[str]): """Abort requests from the scheduler.""" - logger.debug("Aborting requests: %s", request_ids) self.scheduler.finish_requests(request_ids, RequestStatus.FINISHED_ABORTED) From 930ccc2ff19ebb606673564384b3784c36eefd54 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 16:06:37 +0000 Subject: [PATCH 085/132] update logging timing --- vllm/v1/engine/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 8623612c6264d..2f864a1a4a334 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -32,7 +32,7 @@ POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -LOGGING_TIME_S = 1 +LOGGING_TIME_S = 5 class EngineCore: From 52d370ff5a87c9773165ef8a5c84d37d91b01342 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 16:11:10 +0000 Subject: [PATCH 086/132] cleanup nits --- vllm/v1/engine/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 7200aa9a208ee..2fec90c2fec6c 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,4 +1,3 @@ -import enum from dataclasses import dataclass from multiprocessing.process import BaseProcess from typing import List, Optional, Union @@ -7,7 +6,7 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict -from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.sampling_params import SamplingParams @dataclass From 8939e2e6d4027d81e946726afad1deb5c6f1491d Mon Sep 17 00:00:00 2001 From: 
"rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 16:11:48 +0000 Subject: [PATCH 087/132] cleanup --- vllm/v1/engine/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 2fec90c2fec6c..ae10e6f3b8c29 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -19,7 +19,6 @@ class BackgroundProcHandle: @dataclass class EngineRequest: - request_id: str prompt: Optional[str] prompt_token_ids: List[int] @@ -34,13 +33,11 @@ class EngineRequest: @dataclass class EngineAbortRequest: - request_ids: List[str] @dataclass class EngineProfileRequest: - is_start: bool From c2c2e570f3623327edbc22b57c5d46ac2dd5245d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 16:31:22 +0000 Subject: [PATCH 088/132] add sigquit handlers for shutdown --- vllm/utils.py | 8 ++++++- vllm/v1/engine/async_llm.py | 26 ++++++++++++++-------- vllm/v1/engine/core.py | 14 +++++++----- vllm/v1/engine/detokenizer.py | 41 ++++++++++++++++++++--------------- 4 files changed, 55 insertions(+), 34 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 2a8af5ea2d5c5..caed96d200bfc 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -16,6 +16,7 @@ import subprocess import sys import tempfile +import traceback import threading import time import uuid @@ -1614,7 +1615,6 @@ def resolve_obj_by_qualname(qualname: str) -> Any: return getattr(module, obj_name) -# Taken from https://github.com/sgl-project/sglang/blob/23e5e50fd5fba7f315e04294f55060a8171fcc69/python/sglang/srt/utils.py#L630 # noqa: E501 def set_ulimit(target_soft_limit=65535): resource_type = resource.RLIMIT_NOFILE current_soft, current_hard = resource.getrlimit(resource_type) @@ -1631,6 +1631,12 @@ def set_ulimit(target_soft_limit=65535): "increasing with ulimit -n", current_soft, e) +def get_exception_traceback(): + etype, value, tb = sys.exc_info() + err_str = "".join(traceback.format_exception(etype, value, tb)) + return err_str + + def kill_process_tree(pid: int): """ Kills all descendant processes of the given pid by sending SIGKILL. diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 9d3606f0d7229..230eca6a64e87 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -15,6 +15,7 @@ # Inspired by https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/managers/tokenizer_manager.py +import os import asyncio import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union @@ -34,7 +35,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import get_open_zmq_ipc_path +from vllm.utils import get_open_zmq_ipc_path, kill_process_tree from vllm.v1.engine import EngineAbortRequest from vllm.v1.engine.core_client import MultiprocessEngineCore from vllm.v1.engine.detokenizer import DetokenizerClient @@ -65,6 +66,13 @@ def __init__( self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config + # Register the signal handler. + # The child processes will send SIGQUIT to this process when + # any error happens. This process then clean up the whole tree. + def sigquit_handler(signum, frame): + kill_process_tree(os.getpid()) + signal.signal(signal.SIGQUIT, sigquit_handler) + # RequestId -> OutputQueue. 
self.rid_to_queue: Dict[str, asyncio.Queue[RequestOutput]] = {} @@ -84,23 +92,23 @@ def __init__( input_registry=input_registry) # IPC paths. - engine_core_outputs_path = get_open_zmq_ipc_path() - engine_core_inputs_path = get_open_zmq_ipc_path() + from_engine_core_path = get_open_zmq_ipc_path() + to_engine_core_path = get_open_zmq_ipc_path() - # Detokenizer (converts EngineCoreOutputs --> RequestOutput). + # Detokenizer (background process). self.detokenizer = DetokenizerClient( - engine_core_outputs_path=engine_core_outputs_path, - engine_core_inputs_path=engine_core_inputs_path, + from_engine_core_path=from_engine_core_path, + to_engine_core_path=to_engine_core_path, tokenizer_name=vllm_config.model_config.tokenizer, tokenizer_mode=vllm_config.model_config.tokenizer_mode, trust_remote_code=vllm_config.model_config.trust_remote_code, revision=vllm_config.model_config.tokenizer_revision, ) - # EngineCore (starts the engine in background process). + # EngineCore (background process). self.engine_core = MultiprocessEngineCore( - input_path=engine_core_inputs_path, - output_path=engine_core_outputs_path, + input_path=to_engine_core_path, + output_path=from_engine_core_path, vllm_config=vllm_config, executor_class=executor_class, usage_context=usage_context, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 2f864a1a4a334..0a4069cdbb082 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1,4 +1,4 @@ -import pickle +import psutil import queue import signal import threading @@ -15,7 +15,7 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext -from vllm.utils import get_open_zmq_ipc_path +from vllm.utils import get_open_zmq_ipc_path, get_exception_traceback from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineAbortRequest, EngineRequest, @@ -24,7 +24,6 @@ from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus -from vllm.v1.serial_utils import PickleEncoder from vllm.v1.utils import zmq_socket_ctx, wait_for_startup from vllm.version import __version__ as VLLM_VERSION @@ -219,6 +218,8 @@ def signal_handler(signum, frame): signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGINT, signal_handler) + parent_process = psutil.Process().parent() + engine_core = None try: engine_core = EngineCoreProc(*args, **kwargs) @@ -227,9 +228,10 @@ def signal_handler(signum, frame): except SystemExit: logger.debug("EngineCore interrupted.") - except BaseException as e: - logger.exception(e) - raise e + except Exception: + traceback = get_exception_traceback() + logger.error(f"EngineCore hit an exception: {traceback}") + parent_process.send_signal(signal.SIGQUIT) finally: if engine_core is not None: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 8e050e4970bf6..64a117f40e22f 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,3 +1,4 @@ +import psutil import pickle import zmq.asyncio import msgspec @@ -13,7 +14,8 @@ from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.utils import get_open_zmq_ipc_path, kill_process_tree +from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, + get_exception_traceback) 
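The helpers being imported here belong to a single error-propagation pattern that this patch wires into both background processes: the child's busy loop catches any unhandled exception, logs the formatted traceback, and sends SIGQUIT to its parent, whose handler then kills the whole process tree. A condensed sketch of that flow, using the helpers from vllm/utils.py shown above; worker_main and install_parent_sigquit_handler are illustrative names, not functions in the diff (the real code lives in run_engine_core, run_detokenizer, and the server startup):

    import os
    import signal

    import psutil

    from vllm.logger import init_logger
    from vllm.utils import get_exception_traceback, kill_process_tree

    logger = init_logger(__name__)


    def worker_main():
        # Child side (EngineCore / Detokenizer busy loop).
        parent_process = psutil.Process().parent()
        try:
            while True:
                ...  # poll sockets and do work
        except SystemExit:
            logger.debug("Worker interrupted.")
        except Exception:
            traceback = get_exception_traceback()
            logger.error("Worker hit an exception: %s", traceback)
            # Wake the parent so it can tear down the whole process tree.
            parent_process.send_signal(signal.SIGQUIT)


    def install_parent_sigquit_handler():
        # Parent side (front-end process).
        def sigquit_handler(signum, frame):
            kill_process_tree(os.getpid())

        signal.signal(signal.SIGQUIT, sigquit_handler)
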
from vllm.v1.engine import (EngineCoreOutputs, BackgroundProcHandle, EngineRequest, EngineAbortRequest) @@ -279,8 +281,8 @@ class DetokenizerProc(Detokenizer): def __init__( self, *args, - engine_core_outputs_path: str, - engine_core_inputs_path: str, + from_engine_core_path: str, + to_engine_core_path: str, input_path: str, output_path: str, ready_path: str, @@ -288,8 +290,8 @@ def __init__( ): super().__init__(*args, **kwargs) - self.engine_core_outputs_path = engine_core_outputs_path - self.engine_core_inputs_path = engine_core_inputs_path + self.from_engine_core_path = from_engine_core_path + self.to_engine_core_path = to_engine_core_path self.input_path = input_path self.output_path = output_path @@ -300,8 +302,8 @@ def __init__( @staticmethod def make_detokenizer_process( - engine_core_outputs_path: str, - engine_core_inputs_path: str, + from_engine_core_path: str, + to_engine_core_path: str, input_path: str, output_path: str, tokenizer_name: str, @@ -313,8 +315,8 @@ def make_detokenizer_process( ready_path = get_open_zmq_ipc_path() process_kwargs = { - "engine_core_outputs_path": engine_core_outputs_path, - "engine_core_inputs_path": engine_core_inputs_path, + "from_engine_core_path": from_engine_core_path, + "to_engine_core_path": to_engine_core_path, "input_path": input_path, "output_path": output_path, "ready_path": ready_path, @@ -356,6 +358,8 @@ def signal_handler(signum, frame): signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGINT, signal_handler) + parent_process = psutil.Process().parent() + detokenizer = None try: detokenizer = DetokenizerProc(*args, **kwargs) @@ -364,9 +368,10 @@ def signal_handler(signum, frame): except SystemExit: logger.debug("Detokenizer interrupted.") - except BaseException as e: - logger.exception(e) - raise e + except Exception: + traceback = get_exception_traceback() + logger.error(f"Detokenizer hit an exception: {traceback}") + parent_process.send_signal(signal.SIGQUIT) finally: if detokenizer is not None: @@ -437,8 +442,8 @@ def run_busy_loop(self): decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) - with (zmq_socket_ctx(self.engine_core_outputs_path, zmq.PULL) as from_engine_core, - zmq_socket_ctx(self.engine_core_inputs_path, zmq.PUSH) as to_engine_core, + with (zmq_socket_ctx(self.from_engine_core_path, zmq.PULL) as from_engine_core, + zmq_socket_ctx(self.to_engine_core_path, zmq.PUSH) as to_engine_core, zmq_socket_ctx(self.input_path, zmq.PULL) as from_llm_engine, zmq_socket_ctx(self.output_path, zmq.PUSH) as to_llm_engine): @@ -474,8 +479,8 @@ class DetokenizerClient: def __init__(self, *args, - engine_core_outputs_path: str, - engine_core_inputs_path: str, + from_engine_core_path: str, + to_engine_core_path: str, **kwargs): # ZMQ setup. 
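For reference, the run_busy_loop wiring above amounts to four point-to-point ipc links multiplexed with a zmq.Poller: PULL from the LLMEngine/AsyncLLM front-end, PULL from EngineCore, and a PUSH back to each. A stripped-down sketch under those assumptions; busy_loop is an illustrative name and the forwarding bodies are placeholders for the real request bookkeeping and detokenization step:

    import zmq

    from vllm.v1.utils import zmq_socket_ctx


    def busy_loop(from_engine_core_path: str, to_engine_core_path: str,
                  input_path: str, output_path: str):
        # Same PUSH/PULL pairing over ipc:// paths as DetokenizerProc.
        with (zmq_socket_ctx(from_engine_core_path, zmq.PULL) as from_engine_core,
              zmq_socket_ctx(to_engine_core_path, zmq.PUSH) as to_engine_core,
              zmq_socket_ctx(input_path, zmq.PULL) as from_llm_engine,
              zmq_socket_ctx(output_path, zmq.PUSH) as to_llm_engine):

            poller = zmq.Poller()
            poller.register(from_llm_engine, zmq.POLLIN)
            poller.register(from_engine_core, zmq.POLLIN)

            while True:
                socks = dict(poller.poll())

                if from_llm_engine in socks:
                    # New request or abort from the front-end: update local
                    # state, then forward to EngineCore.
                    to_engine_core.send(from_llm_engine.recv())

                if from_engine_core in socks:
                    # EngineCoreOutputs: detokenize, then push RequestOutputs
                    # back to the front-end.
                    to_llm_engine.send(from_engine_core.recv())
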
@@ -500,8 +505,8 @@ def __init__(self, self.proc_handle: Optional[BackgroundProcHandle] self.proc_handle = DetokenizerProc.make_detokenizer_process( *args, - engine_core_outputs_path=engine_core_outputs_path, - engine_core_inputs_path=engine_core_inputs_path, + from_engine_core_path=from_engine_core_path, + to_engine_core_path=to_engine_core_path, input_path=input_path, output_path=output_path, **kwargs, From 51b498df94c208a3e614025df979d0e3eb74c61a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 16:32:34 +0000 Subject: [PATCH 089/132] updated --- vllm/v1/engine/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 0a4069cdbb082..5819e67a24803 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -230,7 +230,7 @@ def signal_handler(signum, frame): except Exception: traceback = get_exception_traceback() - logger.error(f"EngineCore hit an exception: {traceback}") + logger.error("EngineCore hit an exception: %s", traceback) parent_process.send_signal(signal.SIGQUIT) finally: From 84c08b14be993510179b49dfa11bd8287c8d9329 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 17:01:06 +0000 Subject: [PATCH 090/132] signifcantly better error handling --- vllm/entrypoints/openai/api_server.py | 12 +++++++++--- vllm/v1/engine/async_llm.py | 9 +-------- vllm/v1/engine/core.py | 4 +++- vllm/v1/engine/detokenizer.py | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2aa666548bec5..28542c804bbae 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -64,7 +64,7 @@ from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, - is_valid_ipv6_address, set_ulimit) + is_valid_ipv6_address, set_ulimit, kill_process_tree) from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -671,11 +671,17 @@ async def run_server(args, **uvicorn_kwargs) -> None: # workaround to ensure user has enough fds available for uvicorn + ipc set_ulimit() - def signal_handler(*_) -> None: + def sigterm_handler(*_) -> None: # Interrupt server on sigterm while initializing raise KeyboardInterrupt("terminated") + signal.signal(signal.SIGTERM, sigterm_handler) + + # The child processes will send SIGQUIT to this process when + # any error happens. This process then clean up the whole tree. + def sigquit_handler(signum, frame): + kill_process_tree(os.getpid()) + signal.signal(signal.SIGQUIT, sigquit_handler) - signal.signal(signal.SIGTERM, signal_handler) async with build_async_engine_client(args) as engine_client: app = build_app(args) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 230eca6a64e87..9ed77ca8a491a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -60,19 +60,12 @@ def __init__( start_engine_loop: bool = True, ) -> None: assert start_engine_loop - + self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config - # Register the signal handler. - # The child processes will send SIGQUIT to this process when - # any error happens. This process then clean up the whole tree. 
- def sigquit_handler(signum, frame): - kill_process_tree(os.getpid()) - signal.signal(signal.SIGQUIT, sigquit_handler) - # RequestId -> OutputQueue. self.rid_to_queue: Dict[str, asyncio.Queue[RequestOutput]] = {} diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5819e67a24803..bf8bf63951aa7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -246,6 +246,8 @@ def run_busy_loop(self): while True: logger.info(f"EPOCH: {epoch}") epoch += 1 + if epoch == 10: + raise ValueError("Died") # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): @@ -260,7 +262,7 @@ def run_busy_loop(self): except BaseException: raise - # 2) Handle any new client requests (Abort or Add). + # 2) Handle any new inputs. while not self.input_queue.empty(): req = self.input_queue.get_nowait() self._handle_client_request(req) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 64a117f40e22f..e71ce00b3cc5f 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -456,7 +456,7 @@ def run_busy_loop(self): epoch = 0 while True: logger.info(f"EPOCH: {epoch}") - socks = dict(poller.poll()) + socks = dict(poller.poll(timeout=1000)) # Handle input from LLMEngine. if from_llm_engine in socks: From 91aceba10f4321d955a959875344d03538d1602d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 18:28:31 +0000 Subject: [PATCH 091/132] proper shutdown of output loop --- vllm/v1/engine/async_llm.py | 117 +++++++++++----------------------- vllm/v1/engine/core.py | 58 +++++++++++++++-- vllm/v1/engine/detokenizer.py | 4 +- 3 files changed, 94 insertions(+), 85 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 9ed77ca8a491a..f909d59dd3e5b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -15,9 +15,7 @@ # Inspired by https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/managers/tokenizer_manager.py -import os import asyncio -import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -35,10 +33,10 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import get_open_zmq_ipc_path, kill_process_tree +from vllm.utils import get_open_zmq_ipc_path from vllm.v1.engine import EngineAbortRequest -from vllm.v1.engine.core_client import MultiprocessEngineCore -from vllm.v1.engine.detokenizer import DetokenizerClient +from vllm.v1.engine.core import MPEngineCoreClient +from vllm.v1.engine.detokenizer import MPDetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor @@ -89,7 +87,7 @@ def __init__( to_engine_core_path = get_open_zmq_ipc_path() # Detokenizer (background process). - self.detokenizer = DetokenizerClient( + self.detokenizer = MPDetokenizerClient( from_engine_core_path=from_engine_core_path, to_engine_core_path=to_engine_core_path, tokenizer_name=vllm_config.model_config.tokenizer, @@ -99,7 +97,7 @@ def __init__( ) # EngineCore (background process). 
- self.engine_core = MultiprocessEngineCore( + self.engine_core = MPEngineCoreClient( input_path=to_engine_core_path, output_path=from_engine_core_path, vllm_config=vllm_config, @@ -107,9 +105,7 @@ def __init__( usage_context=usage_context, ) - # Create output handler loop during first call to generate(). - self.to_create_loop = True - self.gracefully_exit = False + self.output_handler: Optional[asyncio.Task] = None self.asyncio_tasks = set() def __del__(self): @@ -148,6 +144,9 @@ def from_engine_args( def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" + if output_handler := getattr(self, "output_hander", None): + output_handler.cancel() + if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() @@ -224,9 +223,13 @@ async def generate( try: # Start output_handler on first request. - if self.to_create_loop: - self.create_output_handler() + if not self.output_handler: + loop = asyncio.get_event_loop() + self.output_handler = loop.create_task( + self.output_handler_loop()) + # Add to Detokenizer and EngineCore and makes queue + # to which the output_handler will push RequestOutputs. q = await self.add_request( request_id, prompt, @@ -260,61 +263,32 @@ async def generate( await self.abort(request_id) raise - def create_output_handler(self): - """Creates output handler loop. Called on first generate().""" - - self.to_create_loop = False - loop = asyncio.get_event_loop() - - # Start output handler. - self.asyncio_tasks.add(loop.create_task(self.output_handler())) - - # Start signal handlers for shutdown. - signal_handler = SignalHandler(self) - loop.add_signal_handler(signal.SIGTERM, signal_handler.signal_handler) - self.asyncio_tasks.add(loop.create_task(self.sigterm_watchdog())) - - async def sigterm_watchdog(self): - """Handle shutdown from sigterm.""" - - while not self.gracefully_exit: - await asyncio.sleep(5) - # Drain requests - while True: - remain_num_req = len(self.rid_to_state) - logger.info( - f"Gracefully exiting... remaining number of requests {remain_num_req}" - ) - if remain_num_req > 0: - await asyncio.sleep(5) - else: - break - self.shutdown() - - async def output_handler(self): - """Background loop: pulls from Detokenizer and pushes to queues.""" - - epoch = 0 - while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 - - # 1) Pull outputs from the Detokenizer. - outputs: List[ - RequestOutput] = await self.detokenizer.output_socket.recv_pyobj( - ) - - # 2) Put each output into a per request Queue. - for out in outputs: - if out.request_id not in self.rid_to_queue: - raise RuntimeError(f"{out.request_id} " - "not in RequestStates") + async def output_handler_loop(self): + """Background loop: pulls from Detokenizer and push to Queues.""" - self.rid_to_queue[out.request_id].put_nowait(out) + try: + while True: + # Note: use socket directly to avoid calling await multiple + # times, which causes too much task switching at high QPS. + outputs: List[RequestOutput] = [] + outputs = await self.detokenizer.output_socket.recv_pyobj() + + for out in outputs: + # Note: it is possible that a request was aborted + # due to client cancellation while EngineCoreOutputs + # are still flowing, so we just ignore. + if out.request_id in self.rid_to_queue: + self.rid_to_queue[out.request_id].put_nowait(out) + + except asyncio.CancelledError: + logger.info("Shutting down output_handler_loop") + raise + async def abort(self, request_id: str): - # Remove from Detokenizer and EngineCore (Detokenizer - # forwards the message to EngineCore). 
+ """Abort request if the client cancels the request.""" + + # Send abort to Detokenizer (which will fwd to EngineCore) await self.detokenizer.input_socket.send_pyobj( EngineAbortRequest([request_id])) @@ -385,18 +359,3 @@ def errored(self) -> bool: @property def dead_error(self) -> BaseException: return Exception() # TODO: implement - - -class SignalHandler: - - def __init__(self, async_llm): - self.async_llm = async_llm - - def signal_handler(self, signum=None, frame=None): - logger.warning( - "SIGTERM received. signum=%s frame=%s. Draining " - "requests and shutting down...", - signum, - frame, - ) - self.async_llm.gracefully_exit = True diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index bf8bf63951aa7..e9557064ef900 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -3,7 +3,9 @@ import signal import threading import time -from typing import List, Tuple, Type +import os +import weakref +from typing import List, Optional, Tuple, Type import zmq import zmq.asyncio @@ -15,7 +17,8 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext -from vllm.utils import get_open_zmq_ipc_path, get_exception_traceback +from vllm.utils import (get_open_zmq_ipc_path, get_exception_traceback, + kill_process_tree) from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineAbortRequest, EngineRequest, @@ -246,8 +249,8 @@ def run_busy_loop(self): while True: logger.info(f"EPOCH: {epoch}") epoch += 1 - if epoch == 10: - raise ValueError("Died") + # if epoch == 10: + # raise ValueError("Died") # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): @@ -325,3 +328,50 @@ def process_output_socket(self, output_path: str): outputs = EngineCoreOutputs(outputs=engine_core_outputs) encoder.encode_into(outputs, buffer) socket.send_multipart((buffer, ), copy=False) + + +class MPEngineCoreClient: + """ + MPEngineCoreClient: client for multi-proc EngineCore. + EngineCore runs in a background process busy loop, getting + new EngineRequests and returning EngineCoreOutputs + + * pushes EngineRequests via input_socket + * pulls EngineCoreOutputs via output_socket + """ + + def __init__(self, *args, input_path: str, output_path: str, **kwargs): + # Start EngineCore in background process. + self.proc_handle: Optional[BackgroundProcHandle] + self.proc_handle = EngineCoreProc.make_engine_core_process( + *args, + input_path=input_path, + output_path=output_path, + **kwargs, + ) + self._finalizer = weakref.finalize(self, self.shutdown) + + def shutdown(self): + if hasattr(self, "proc_handle") and self.proc_handle: + # Shutdown the process if needed. 
+ if self.proc_handle.proc.is_alive(): + self.proc_handle.proc.terminate() + self.proc_handle.proc.join(5) + + if self.proc_handle.proc.is_alive(): + kill_process_tree(self.proc_handle.proc.pid) + + # Remove zmq ipc socket files + ipc_sockets = [ + self.proc_handle.ready_path, + self.proc_handle.output_path, + self.proc_handle.input_path + ] + for ipc_socket in ipc_sockets: + socket_file = ipc_socket.replace("ipc://", "") + if os and os.path.exists(socket_file): + os.remove(socket_file) + self.proc_handle = None + + def __del__(self): + self.shutdown() diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index e71ce00b3cc5f..f8b388f9ff9d0 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -456,7 +456,7 @@ def run_busy_loop(self): epoch = 0 while True: logger.info(f"EPOCH: {epoch}") - socks = dict(poller.poll(timeout=1000)) + socks = dict(poller.poll()) # Handle input from LLMEngine. if from_llm_engine in socks: @@ -475,7 +475,7 @@ def run_busy_loop(self): decoder=decoder, ) -class DetokenizerClient: +class MPDetokenizerClient: def __init__(self, *args, From 87e7ebd9937b2c2990381f13af6ce7124ceb4285 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 18:29:42 +0000 Subject: [PATCH 092/132] update comment --- vllm/v1/engine/async_llm.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index f909d59dd3e5b..441eab9adaf12 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -257,8 +257,7 @@ async def generate( yield out # Client request cancellation is handled through calling - # task.cancel() on generate. So if we get this error, we - # need to abort the request. + # task.cancel() on generate. So we call abort if canceled. except asyncio.CancelledError: await self.abort(request_id) raise @@ -266,23 +265,18 @@ async def generate( async def output_handler_loop(self): """Background loop: pulls from Detokenizer and push to Queues.""" - try: - while True: - # Note: use socket directly to avoid calling await multiple - # times, which causes too much task switching at high QPS. - outputs: List[RequestOutput] = [] - outputs = await self.detokenizer.output_socket.recv_pyobj() - - for out in outputs: - # Note: it is possible that a request was aborted - # due to client cancellation while EngineCoreOutputs - # are still flowing, so we just ignore. - if out.request_id in self.rid_to_queue: - self.rid_to_queue[out.request_id].put_nowait(out) - - except asyncio.CancelledError: - logger.info("Shutting down output_handler_loop") - raise + while True: + # Note: use socket directly to avoid calling await multiple + # times, which causes too much task switching at high QPS. + outputs: List[RequestOutput] = [] + outputs = await self.detokenizer.output_socket.recv_pyobj() + + for out in outputs: + # Note: it is possible that a request was aborted + # due to client cancellation while EngineCoreOutputs + # are still flowing, so we just ignore. 
+ if out.request_id in self.rid_to_queue: + self.rid_to_queue[out.request_id].put_nowait(out) async def abort(self, request_id: str): From 2e3257c136e9d29b2d7802b126a72e5c63909f3a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 18:33:24 +0000 Subject: [PATCH 093/132] updated --- vllm/v1/engine/detokenizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index f8b388f9ff9d0..8320d3260954a 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -456,6 +456,7 @@ def run_busy_loop(self): epoch = 0 while True: logger.info(f"EPOCH: {epoch}") + socks = dict(poller.poll()) # Handle input from LLMEngine. From 3b13d89644fd3604c3adcc90ea31eb10c39460e5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 18:59:03 +0000 Subject: [PATCH 094/132] support in LLMEngine --- vllm/v1/engine/async_llm.py | 7 +- vllm/v1/engine/core.py | 1 - vllm/v1/engine/core_client.py | 158 ---------------------------------- vllm/v1/engine/llm_engine.py | 92 ++++++++++++-------- 4 files changed, 58 insertions(+), 200 deletions(-) delete mode 100644 vllm/v1/engine/core_client.py diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 441eab9adaf12..586db9ef804c0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -106,7 +106,6 @@ def __init__( ) self.output_handler: Optional[asyncio.Task] = None - self.asyncio_tasks = set() def __del__(self): self.shutdown() @@ -257,18 +256,20 @@ async def generate( yield out # Client request cancellation is handled through calling - # task.cancel() on generate. So we call abort if canceled. + # task.cancel() on generate(). Calling self.abort() forwards the + # cancellation to the EngineCore and Detokenizer. except asyncio.CancelledError: await self.abort(request_id) raise + async def output_handler_loop(self): """Background loop: pulls from Detokenizer and push to Queues.""" while True: # Note: use socket directly to avoid calling await multiple # times, which causes too much task switching at high QPS. - outputs: List[RequestOutput] = [] + outputs: List[RequestOutput] = [] outputs = await self.detokenizer.output_socket.recv_pyobj() for out in outputs: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e9557064ef900..0f12ddceaa026 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -329,7 +329,6 @@ def process_output_socket(self, output_path: str): encoder.encode_into(outputs, buffer) socket.send_multipart((buffer, ), copy=False) - class MPEngineCoreClient: """ MPEngineCoreClient: client for multi-proc EngineCore. diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py deleted file mode 100644 index ba8b4c203801f..0000000000000 --- a/vllm/v1/engine/core_client.py +++ /dev/null @@ -1,158 +0,0 @@ -import os -import weakref -from typing import List, Optional - -from vllm.logger import init_logger -from vllm.utils import kill_process_tree, get_open_zmq_ipc_path -from vllm.v1.engine import (BackgroundProcHandle, EngineCoreOutput, - EngineRequest) -from vllm.v1.engine.core import (EngineCore, EngineCoreProc) - -logger = init_logger(__name__) - - -class EngineCoreClient: - """ - EngineCoreClient: subclasses handle different methods for pushing - and pulling from the EngineCore for asyncio / multiprocessing. 
- - Subclasses: - * InprocClient: In process EngineCore (for V0-style LLMEngine use) - * SyncMPClient: ZMQ + background proc EngineCore (for LLM) - * AsyncMPClient: ZMQ + background proc EngineCore w/ asyncio (for AsyncLLM) - """ - - @staticmethod - def make_client( - *args, - multiprocess_mode: bool, - asyncio_mode: bool, - **kwargs, - ) -> "EngineCoreClient": - - # TODO: support this for debugging purposes. - if asyncio_mode and not multiprocess_mode: - raise NotImplementedError( - "Running EngineCore in asyncio without multiprocessing " - "is not currently supported.") - - if multiprocess_mode and asyncio_mode: - return AsyncMPClient(*args, **kwargs) - - if multiprocess_mode and not asyncio_mode: - return SyncMPClient(*args, **kwargs) - - return InprocClient(*args, **kwargs) - - def shutdown(self): - pass - - def get_output(self) -> List[EngineCoreOutput]: - raise NotImplementedError - - def add_request(self, request: EngineRequest) -> None: - raise NotImplementedError - - def profile(self, is_start: bool = True) -> None: - raise NotImplementedError - - def abort_requests(self, request_ids: List[str]) -> None: - raise NotImplementedError - - async def get_output_async(self) -> List[EngineCoreOutput]: - raise NotImplementedError - - async def add_request_async(self, request: EngineRequest) -> None: - raise NotImplementedError - - async def profile_async(self, is_start: bool = True) -> None: - raise NotImplementedError - - async def abort_requests_async(self, request_ids: List[str]) -> None: - raise NotImplementedError - - -class InprocClient(EngineCoreClient): - """ - InprocClient: client for in-process EngineCore. Intended - for use in LLMEngine for V0-style add_request() and step() - EngineCore setup in this process (no busy loop). - - * pushes EngineRequest directly into the EngineCore - * pulls EngineCoreOutputs by stepping the EngineCore - - TODO: support asyncio-mode for debugging. - """ - - def __init__(self, *args, **kwargs): - self.engine_core = EngineCore(*args, **kwargs) - - def get_output(self) -> List[EngineCoreOutput]: - return self.engine_core.step() - - def add_request(self, request: EngineRequest) -> None: - self.engine_core.add_request(request) - - def abort_requests(self, request_ids: List[str]) -> None: - self.engine_core.abort_requests(request_ids) - - def shutdown(self): - self.engine_core.shutdown() - - def __del__(self): - self.shutdown() - - def profile(self, is_start: bool = True) -> None: - self.engine_core.profile(is_start) - - -class MultiprocessEngineCore: - """ - MultiprocessEngineCore: base client for multi-proc EngineCore. - EngineCore runs in a background process busy loop, getting - new EngineRequests and returning EngineCoreOutputs - - * pushes EngineRequests via input_socket - * pulls EngineCoreOutputs via output_socket - """ - - def __init__( - self, - *args, - input_path: Optional[str] = None, - output_path: Optional[str] = None, - **kwargs, - ): - # Start EngineCore in background process. - self.proc_handle: Optional[BackgroundProcHandle] - self.proc_handle = EngineCoreProc.make_engine_core_process( - *args, - input_path=(input_path or get_open_zmq_ipc_path()), - output_path=(output_path or get_open_zmq_ipc_path()), - **kwargs, - ) - self._finalizer = weakref.finalize(self, self.shutdown) - - def shutdown(self): - if hasattr(self, "proc_handle") and self.proc_handle: - # Shutdown the process if needed. 
- if self.proc_handle.proc.is_alive(): - self.proc_handle.proc.terminate() - self.proc_handle.proc.join(5) - - if self.proc_handle.proc.is_alive(): - kill_process_tree(self.proc_handle.proc.pid) - - # Remove zmq ipc socket files - ipc_sockets = [ - self.proc_handle.ready_path, self.proc_handle.output_path, - self.proc_handle.input_path - ] - for ipc_socket in ipc_sockets: - socket_file = ipc_socket.replace("ipc://", "") - if os and os.path.exists(socket_file): - os.remove(socket_file) - self.proc_handle = None - - def __del__(self): - self.shutdown() diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 8269877bb9c8f..11a9f01a54cf8 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -42,8 +42,8 @@ def __init__( use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: - - # TODO: Can we avoid this? + + self.mulitprocess_mode = multiprocess_mode self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). @@ -62,22 +62,32 @@ def __init__( input_registry=input_registry, mm_registry=mm_registry) - # Detokenizer (converts EngineCoreOutputs --> RequestOutput) - self.detokenizer = Detokenizer( - tokenizer_name=vllm_config.model_config.tokenizer, - tokenizer_mode=vllm_config.model_config.tokenizer_mode, - trust_remote_code=vllm_config.model_config.trust_remote_code, - revision=vllm_config.model_config.tokenizer_revision, - ) - - # EngineCore (gets EngineRequests and gives EngineCoreOutputs) - self.engine_core = EngineCoreClient.make_client( - vllm_config, - executor_class, - usage_context, - multiprocess_mode=multiprocess_mode, - asyncio_mode=False, - ) + if self.multiprocess_mode: + # IPC paths. + from_engine_core_path = get_open_zmq_ipc_path() + to_engine_core_path = get_open_zmq_ipc_path() + + # Detokenizer (background process). + self.detokenizer_client = MPDetokenizerClient( + from_engine_core_path=from_engine_core_path, + to_engine_core_path=to_engine_core_path, + tokenizer_name=vllm_config.model_config.tokenizer, + tokenizer_mode=vllm_config.model_config.tokenizer_mode, + trust_remote_code=vllm_config.model_config.trust_remote_code, + revision=vllm_config.model_config.tokenizer_revision, + ) + + # EngineCore (background process). + self.engine_core_client = MPEngineCoreClient( + input_path=to_engine_core_path, + output_path=from_engine_core_path, + vllm_config=vllm_config, + executor_class=executor_class, + usage_context=usage_context, + ) + + else: + @classmethod def from_engine_args( @@ -149,32 +159,38 @@ def add_request( ) -> None: # 1) Process raw inputs into the request. - detokenizer_req, engine_core_req = self.processor.process_inputs( + engine_request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 2) Add the request to Detokenizer. - self.detokenizer.add_request(detokenizer_req) - - # 3) Add the request to EngineCore. - self.engine_core.add_request(engine_core_req) + # 2) Add to Detokenizer and EngineCore. + if self.multiprocess_mode: + # Send to Detokenizer (which forwards to EngineCore). + self.detokenizer.input_socket.send_pyobj(engine_request) + else: + # Add directly to Detokenizer and EngineCore. + self.detokenizer.add_request(engine_request) + self.engine_core.add_request(engine_request) def step(self) -> List[RequestOutput]: + + if self.multiprocess_mode: + # Get next output from the Detokenizer. 
+ return self.detokenizer.output_socket.recv_pyobj() - # 1) Get EngineCoreOutput from the EngineCore. - engine_core_outputs = self.engine_core.get_output() - - # 2) Detokenizer the EngineCoreOutput. - request_outputs, requests_to_abort = self.detokenizer.step( - engine_core_outputs) - - # 3) Abort requests that finished due to stopping criteria. - if requests_to_abort: - self.abort_request(requests_to_abort) - - return request_outputs - - # TODO(rob): Can we get rid of these? + else: + # 1) Get EngineCoreOutput from the EngineCore. + engine_core_outputs = self.engine_core.step() + + # 2) Detokenizee the EngineCoreOutput. + request_outputs, request_to_abort = self.detokenizer.step( + engine_core_outputs) + + # 3) Abort requests that finished due to stopping criteria. + if requests_to_abort: + self.abort_request(requests_to_abort) + + return request_outputs def get_model_config(self): return self.model_config From 6f383f28db77b5306443faf96b1c27ec59255e12 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:09:14 +0000 Subject: [PATCH 095/132] updated --- vllm/v1/engine/detokenizer.py | 14 +++++++++++++- vllm/v1/engine/llm_engine.py | 27 +++++++++++++++++++++------ 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 8320d3260954a..dfd54bf8502e6 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -235,7 +235,19 @@ def abort_requests( for request_id in request_ids: self.request_states.pop(request_id, None) - + + def add_request( + self, + request: EngineRequest, + ): + """Add new request to the Detokenizer.""" + + assert (request.request_id not in self.request_states) + + request_state = IncrementalDetokenizer.from_new_request( + self.tokenizer, request) + self.request_states[request.request_id] = request_state + def step( self, encore_core_outputs: EngineCoreOutputs, ) -> Tuple[List[RequestOutput], List[str]]: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 11a9f01a54cf8..0e845b94ecd2d 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -17,8 +17,9 @@ from vllm.transformers_utils.tokenizer_group import ( BaseTokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext -from vllm.v1.engine.core_client import EngineCoreClient -from vllm.v1.engine.detokenizer import Detokenizer +from vllm.utils import get_open_zmq_ipc_path +from vllm.v1.engine.core import EngineCore, MPEngineCoreClient +from vllm.v1.engine.detokenizer import Detokenizer, MPDetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor @@ -43,7 +44,7 @@ def __init__( multiprocess_mode: bool = False, ) -> None: - self.mulitprocess_mode = multiprocess_mode + self.multiprocess_mode = multiprocess_mode self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). @@ -87,7 +88,20 @@ def __init__( ) else: + # Detokenizer (in process). + self.detokenizer = Detokenizer( + tokenizer_name=vllm_config.model_config.tokenizer, + tokenizer_mode=vllm_config.model_config.tokenizer_mode, + trust_remote_code=vllm_config.model_config.trust_remote_code, + revision=vllm_config.model_config.tokenizer_revision, + ) + # EngineCore (in process). 
+ self.engine_core = EngineCore( + vllm_config=vllm_config, + executor_class=executor_class, + usage_context=usage_context, + ) @classmethod def from_engine_args( @@ -143,6 +157,7 @@ def validate_outputs(cls, outputs, output_type): def abort_request(self, request_ids: List[str]) -> None: """Remove request_ids from EngineCore and Detokenizer.""" + assert not self.multiprocess_mode self.engine_core.abort_requests(request_ids) self.detokenizer.abort_requests(request_ids) @@ -166,7 +181,7 @@ def add_request( # 2) Add to Detokenizer and EngineCore. if self.multiprocess_mode: # Send to Detokenizer (which forwards to EngineCore). - self.detokenizer.input_socket.send_pyobj(engine_request) + self.detokenizer_client.input_socket.send_pyobj(engine_request) else: # Add directly to Detokenizer and EngineCore. self.detokenizer.add_request(engine_request) @@ -176,14 +191,14 @@ def step(self) -> List[RequestOutput]: if self.multiprocess_mode: # Get next output from the Detokenizer. - return self.detokenizer.output_socket.recv_pyobj() + return self.detokenizer_client.output_socket.recv_pyobj() else: # 1) Get EngineCoreOutput from the EngineCore. engine_core_outputs = self.engine_core.step() # 2) Detokenizee the EngineCoreOutput. - request_outputs, request_to_abort = self.detokenizer.step( + request_outputs, requests_to_abort = self.detokenizer.step( engine_core_outputs) # 3) Abort requests that finished due to stopping criteria. From 30d7333fd8d5ddc330e460d8fb47129bb61ae6e8 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:09:35 +0000 Subject: [PATCH 096/132] nit --- vllm/v1/engine/detokenizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index dfd54bf8502e6..5ed1327630269 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -201,6 +201,7 @@ def _get_next_output_text(self, finished: bool, delta: bool) -> str: return self.output_text[last_offset:length] return "" + class Detokenizer: def __init__(self, From bd49c9cb42be711414ed19036835bba051066f62 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:10:26 +0000 Subject: [PATCH 097/132] updated --- vllm/v1/engine/detokenizer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 5ed1327630269..4ec5808660883 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -235,18 +235,18 @@ def abort_requests( """Remove the request_ids from the Detokenizer.""" for request_id in request_ids: - self.request_states.pop(request_id, None) - + self.request_states.pop(request_id, None) + def add_request( self, request: EngineRequest, ): - """Add new request to the Detokenizer.""" + """Add new request to the Detokenizer.""" assert (request.request_id not in self.request_states) - request_state = IncrementalDetokenizer.from_new_request( - self.tokenizer, request) + request_state = IncrementalDetokenizer.from_new_request( + self.tokenizer, request) self.request_states[request.request_id] = request_state def step( From 2192ae61e62df4777c20c2cdb97a4f8758cb204c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:11:15 +0000 Subject: [PATCH 098/132] make PR cleaner --- vllm/entrypoints/openai/api_server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 28542c804bbae..ed3634a50e7e2 100644 --- 
a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -674,12 +674,14 @@ async def run_server(args, **uvicorn_kwargs) -> None: def sigterm_handler(*_) -> None: # Interrupt server on sigterm while initializing raise KeyboardInterrupt("terminated") + signal.signal(signal.SIGTERM, sigterm_handler) # The child processes will send SIGQUIT to this process when # any error happens. This process then clean up the whole tree. def sigquit_handler(signum, frame): kill_process_tree(os.getpid()) + signal.signal(signal.SIGQUIT, sigquit_handler) From 611d1b0ba997eefc747002b83ae1254620cdd2b3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:12:20 +0000 Subject: [PATCH 099/132] make PR cleaner --- vllm/v1/engine/core.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 0f12ddceaa026..5270f4ebc801f 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -110,6 +110,9 @@ def add_request(self, request: EngineRequest): def abort_requests(self, request_ids: List[str]): """Abort requests from the scheduler.""" + # TODO: The scheduler doesn't really need to know the + # specific finish reason, TBD whether we propagate that + # (i.e. client-aborted vs stop criteria met). self.scheduler.finish_requests(request_ids, RequestStatus.FINISHED_ABORTED) From b12d0e6923b61b59e9042a1162a339f18f0fc47b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:15:06 +0000 Subject: [PATCH 100/132] make pr cleaner --- vllm/v1/engine/detokenizer.py | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 4ec5808660883..3789979f8c8d4 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -237,13 +237,13 @@ def abort_requests( for request_id in request_ids: self.request_states.pop(request_id, None) - def add_request( - self, - request: EngineRequest, - ): + def add_request( + self, + request: EngineRequest, + ): """Add new request to the Detokenizer.""" - assert (request.request_id not in self.request_states) + assert (request.request_id not in self.request_states) request_state = IncrementalDetokenizer.from_new_request( self.tokenizer, request) @@ -400,25 +400,10 @@ def _handle_from_llm_engine( pickled_req = from_llm_engine.recv() req = pickle.loads(pickled_req) - # Request added by client, add to RequestStates. if isinstance(req, EngineRequest): - if req.request_id in self.request_states: - raise ValueError( - f"{req.request_id} already in Request States!") - - # Add to RequestStates. - request_state = IncrementalDetokenizer.from_new_request( - self.tokenizer, req) - self.request_states[req.request_id] = request_state - - # Request aborted by client, delete from RequestStates. + self.add_request(req) elif isinstance(req, EngineAbortRequest): - if req.request_id not in self.request_states: - # If not found, the request is already completed - # and we can safely ignore. 
- pass - del self.request_states[req.request_id] - + self.abort_requests(req.request_ids) else: raise ValueError(f"Unknown type: {req}") From 1dac1f1b9bf59f6fceccd7ed1f6994196cff6a5b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:15:53 +0000 Subject: [PATCH 101/132] more cleanup --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 586db9ef804c0..380266419df52 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -58,7 +58,7 @@ def __init__( start_engine_loop: bool = True, ) -> None: assert start_engine_loop - + self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers = stat_loggers From 40c5cd5348fcb9c41533e35185fe47674edf0dfe Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:16:42 +0000 Subject: [PATCH 102/132] more cleanup --- vllm/v1/engine/async_llm.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 380266419df52..7f2a597bbdd97 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -76,11 +76,13 @@ def __init__( self.tokenizer.ping() # Processor (converts Inputs --> EngineRequest). - self.processor = Processor(model_config=vllm_config.model_config, - cache_config=vllm_config.cache_config, - lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer, - input_registry=input_registry) + self.processor = Processor( + model_config=vllm_config.model_config, + cache_config=vllm_config.cache_config, + lora_config=vllm_config.lora_config, + tokenizer=self.tokenizer, + input_registry=input_registry, + ) # IPC paths. from_engine_core_path = get_open_zmq_ipc_path() From a1e17c41977b88a9640806e2d21d4d48a2cee197 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:18:48 +0000 Subject: [PATCH 103/132] updated --- vllm/v1/engine/async_llm.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7f2a597bbdd97..3601085801a4c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -89,7 +89,7 @@ def __init__( to_engine_core_path = get_open_zmq_ipc_path() # Detokenizer (background process). - self.detokenizer = MPDetokenizerClient( + self.detokenizer_client = MPDetokenizerClient( from_engine_core_path=from_engine_core_path, to_engine_core_path=to_engine_core_path, tokenizer_name=vllm_config.model_config.tokenizer, @@ -99,7 +99,7 @@ def __init__( ) # EngineCore (background process). 
- self.engine_core = MPEngineCoreClient( + self.engine_core_client = MPEngineCoreClient( input_path=to_engine_core_path, output_path=from_engine_core_path, vllm_config=vllm_config, @@ -148,11 +148,11 @@ def shutdown(self): if output_handler := getattr(self, "output_hander", None): output_handler.cancel() - if engine_core := getattr(self, "engine_core", None): - engine_core.shutdown() + if engine_core_client := getattr(self, "engine_core_client", None): + engine_core_client.shutdown() - if detokenizer := getattr(self, "detokenizer", None): - detokenizer.shutdown() + if detokenizer_client := getattr(self, "detokenizer_client", None): + detokenizer_client.shutdown() @classmethod def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: @@ -190,7 +190,7 @@ async def add_request( self.rid_to_queue[request_id] = asyncio.Queue() # 3) Send to Detokenizer (which forwards to EngineCore). - await self.detokenizer.input_socket.send_pyobj(engine_request) + await self.detokenizer_client.input_socket.send_pyobj(engine_request) return self.rid_to_queue[request_id] @@ -272,7 +272,7 @@ async def output_handler_loop(self): # Note: use socket directly to avoid calling await multiple # times, which causes too much task switching at high QPS. outputs: List[RequestOutput] = [] - outputs = await self.detokenizer.output_socket.recv_pyobj() + outputs = await self.detokenizer_client.output_socket.recv_pyobj() for out in outputs: # Note: it is possible that a request was aborted @@ -286,7 +286,7 @@ async def abort(self, request_id: str): """Abort request if the client cancels the request.""" # Send abort to Detokenizer (which will fwd to EngineCore) - await self.detokenizer.input_socket.send_pyobj( + await self.detokenizer_client.input_socket.send_pyobj( EngineAbortRequest([request_id])) # Remove from request output queues. @@ -336,10 +336,10 @@ async def check_health(self) -> None: logger.debug("Called check_health.") async def start_profile(self) -> None: - await self.engine_core.profile_async(True) + await self.engine_core_client.profile_async(True) async def stop_profile(self) -> None: - await self.engine_core.profile_async(False) + await self.engine_core_client.profile_async(False) @property def is_running(self) -> bool: From 921a56aeb1bf1c9c36b36a4602d1c46ffbba7f36 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:19:38 +0000 Subject: [PATCH 104/132] updated comment --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 3601085801a4c..23c6c1938e091 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -75,7 +75,7 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() - # Processor (converts Inputs --> EngineRequest). + # Processor (in process). 
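+ # Converts raw inputs into EngineRequests before they are handed to the background processes.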
self.processor = Processor( model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, From 19aadbb1195481577c9f53bfc8295009f436ca73 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:20:49 +0000 Subject: [PATCH 105/132] updated --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 23c6c1938e091..fe755df95cbe7 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -285,7 +285,7 @@ async def output_handler_loop(self): async def abort(self, request_id: str): """Abort request if the client cancels the request.""" - # Send abort to Detokenizer (which will fwd to EngineCore) + # Send abort to Detokenizer (which will fwd to EngineCore). await self.detokenizer_client.input_socket.send_pyobj( EngineAbortRequest([request_id])) From ddae79cf2934ff659af14b72eb03471d52eea8ef Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:23:05 +0000 Subject: [PATCH 106/132] updated --- vllm/v1/engine/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 0e845b94ecd2d..9bd26ded88b53 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -43,7 +43,7 @@ def __init__( use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: - + self.multiprocess_mode = multiprocess_mode self.model_config = vllm_config.model_config From 4b00ae04c454f34813635b9e5b8500a757b64dcf Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:47:32 +0000 Subject: [PATCH 107/132] factor out proc handle code --- vllm/v1/engine/__init__.py | 20 +++++++++++++-- vllm/v1/engine/core.py | 48 +++++++++++------------------------ vllm/v1/engine/detokenizer.py | 25 +++++++++--------- vllm/v1/utils.py | 26 ++----------------- 4 files changed, 47 insertions(+), 72 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index ae10e6f3b8c29..849592d05b1b2 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,3 +1,4 @@ +import os from dataclasses import dataclass from multiprocessing.process import BaseProcess from typing import List, Optional, Union @@ -7,15 +8,30 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.sampling_params import SamplingParams - +from vllm.utils import kill_process_tree @dataclass class BackgroundProcHandle: proc: BaseProcess - ready_path: str input_path: str output_path: str + def shutdown(self): + # Shutdown the process if needed. 
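+ # Terminate gracefully first, then fall back to killing the process tree.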
+ if self.proc.is_alive(): + self.proc.terminate() + self.proc.join(5) + + if self.proc.is_alive(): + kill_process_tree(self.proc.pid) + + # Remove zmq ipc socket files + ipc_sockets = [self.output_path, self.input_path] + for ipc_socket in ipc_sockets: + socket_file = ipc_socket.replace("ipc://", "") + if os and os.path.exists(socket_file): + os.remove(socket_file) + @dataclass class EngineRequest: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5270f4ebc801f..f4edb1b4e12a2 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -10,6 +10,7 @@ import zmq import zmq.asyncio from msgspec import msgpack +from multiprocessing.connection import Connection from vllm.config import CacheConfig, VllmConfig from vllm.executor.multiproc_worker_utils import get_mp_context @@ -17,8 +18,7 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext -from vllm.utils import (get_open_zmq_ipc_path, get_exception_traceback, - kill_process_tree) +from vllm.utils import get_exception_traceback, kill_process_tree from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineAbortRequest, EngineRequest, @@ -27,7 +27,7 @@ from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus -from vllm.v1.utils import zmq_socket_ctx, wait_for_startup +from vllm.v1.utils import zmq_socket_ctx from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -138,8 +138,6 @@ def profile(self, is_start: bool = True): class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" - READY_STR = "READY" - def __init__( self, vllm_config: VllmConfig, @@ -147,7 +145,7 @@ def __init__( usage_context: UsageContext, input_path: str, output_path: str, - ready_path: str, + ready_pipe: Connection, ): super().__init__(vllm_config, executor_class, usage_context) @@ -166,8 +164,7 @@ def __init__( daemon=True).start() # Send Readiness signal to EngineClient. - with zmq_socket_ctx(ready_path, zmq.PUSH) as ready_socket: - ready_socket.send_string(EngineCoreProc.READY_STR) + ready_pipe.send({"status": "READY"}) @staticmethod def make_engine_core_process( @@ -178,27 +175,29 @@ def make_engine_core_process( output_path: str, ) -> BackgroundProcHandle: context = get_mp_context() - ready_path = get_open_zmq_ipc_path() + reader, writer = context.Pipe(duplex=False) process_kwargs = { "input_path": input_path, "output_path": output_path, - "ready_path": ready_path, + "ready_pipe": writer, "vllm_config": vllm_config, "executor_class": executor_class, "usage_context": usage_context, } + # Run EngineCore busy loop in background process. proc = context.Process(target=EngineCoreProc.run_engine_core, kwargs=process_kwargs) proc.start() - wait_for_startup(proc=proc, - ready_path=ready_path, - ready_str=EngineCoreProc.READY_STR, - timeout_ms=POLLING_TIMEOUT_MS) + + # Wait for startup. + if reader.recv()["status"] != "READY": + raise RuntimeError( + "EngineCore initalization failed. See root cause above." + ) return BackgroundProcHandle(proc=proc, - ready_path=ready_path, input_path=input_path, output_path=output_path) @@ -355,24 +354,7 @@ def __init__(self, *args, input_path: str, output_path: str, **kwargs): def shutdown(self): if hasattr(self, "proc_handle") and self.proc_handle: - # Shutdown the process if needed. 
- if self.proc_handle.proc.is_alive(): - self.proc_handle.proc.terminate() - self.proc_handle.proc.join(5) - - if self.proc_handle.proc.is_alive(): - kill_process_tree(self.proc_handle.proc.pid) - - # Remove zmq ipc socket files - ipc_sockets = [ - self.proc_handle.ready_path, - self.proc_handle.output_path, - self.proc_handle.input_path - ] - for ipc_socket in ipc_sockets: - socket_file = ipc_socket.replace("ipc://", "") - if os and os.path.exists(socket_file): - os.remove(socket_file) + self.proc_handle.shutdown() self.proc_handle = None def __del__(self): diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 3789979f8c8d4..be9857f1f6aa0 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -298,7 +298,7 @@ def __init__( to_engine_core_path: str, input_path: str, output_path: str, - ready_path: str, + write_: str, **kwargs ): super().__init__(*args, **kwargs) @@ -325,14 +325,14 @@ def make_detokenizer_process( revision: Optional[str] = None, ) -> BackgroundProcHandle: context = get_mp_context() - ready_path = get_open_zmq_ipc_path() + reader, writer = context.Pipe(duplex=False) process_kwargs = { "from_engine_core_path": from_engine_core_path, "to_engine_core_path": to_engine_core_path, "input_path": input_path, "output_path": output_path, - "ready_path": ready_path, + "ready_pipe": writer, "tokenizer_name": tokenizer_name, "tokenizer_mode": tokenizer_mode, "trust_remote_code": trust_remote_code, @@ -342,13 +342,14 @@ def make_detokenizer_process( proc = context.Process(target=DetokenizerProc.run_detokenizer, kwargs=process_kwargs) proc.start() - wait_for_startup(proc=proc, - ready_path=ready_path, - ready_str=DetokenizerProc.READY_STR, - timeout_ms=POLLING_TIMEOUT_MS) + + # Wait for startup. + if reader.recv()["status"] != "READY": + raise RuntimeError( + "Detokenizer initalization failed. See root cause above." 
+ ) return BackgroundProcHandle(proc=proc, - ready_path=ready_path, input_path=input_path, output_path=output_path) @@ -512,8 +513,6 @@ def __init__(self, ) def shutdown(self): - self.proc_handle.proc.terminate() - self.proc_handle.proc.join(5) - - if self.proc_handle.proc.is_alive(): - kill_process_tree(self.proc_handle.proc.pid) + if hasattr(self, "proc_handle") and self.proc_handle: + self.proc_handle.shutdown() + self.proc_handle = None diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 492c9094f8307..f610c4c90d3dd 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,4 +1,5 @@ from multiprocessing.process import BaseProcess +from multiprocessing.connection import Connection from collections.abc import Sequence from contextlib import contextmanager @@ -125,27 +126,4 @@ def zmq_socket_ctx( finally: ctx.destroy(linger=0) - - -def wait_for_startup( - proc: BaseProcess, - ready_path: str, - ready_str: str, - timeout_ms: int, -) -> None: - """Wait until a background process is ready.""" - - with zmq_socket_ctx(ready_path, zmq.PULL) as socket: - try: - while socket.poll(timeout=timeout_ms) == 0: - logger.debug("Waiting for background proc to startup.") - - if not proc.is_alive(): - raise RuntimeError("Background process failed to start.") - - message = socket.recv_string() - assert message == ready_str - - except BaseException as e: - logger.exception(e) - raise e + From 467d63e9bba5f0ad7574461e0d334fe323a65151 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 19:47:50 +0000 Subject: [PATCH 108/132] actually save before commiting --- vllm/v1/engine/core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f4edb1b4e12a2..59b66d0de11b9 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -3,7 +3,6 @@ import signal import threading import time -import os import weakref from typing import List, Optional, Tuple, Type @@ -18,7 +17,7 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext -from vllm.utils import get_exception_traceback, kill_process_tree +from vllm.utils import get_exception_traceback from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineAbortRequest, EngineRequest, From afd4b522d83502a7b317ec701a7b67c1389bd050 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 20:07:09 +0000 Subject: [PATCH 109/132] actually save before commiting --- vllm/v1/engine/__init__.py | 25 ------------------- vllm/v1/engine/async_llm.py | 4 ++- vllm/v1/engine/core.py | 31 ++++++------------------ vllm/v1/engine/llm_engine.py | 20 ++++++++------- vllm/v1/utils.py | 47 ++++++++++++++++++++++++++++++++++-- 5 files changed, 67 insertions(+), 60 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 849592d05b1b2..3af0219db1c14 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,6 +1,4 @@ -import os from dataclasses import dataclass -from multiprocessing.process import BaseProcess from typing import List, Optional, Union import msgspec @@ -8,29 +6,6 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.sampling_params import SamplingParams -from vllm.utils import kill_process_tree - -@dataclass -class BackgroundProcHandle: - proc: BaseProcess - input_path: str - output_path: str - - def 
shutdown(self): - # Shutdown the process if needed. - if self.proc.is_alive(): - self.proc.terminate() - self.proc.join(5) - - if self.proc.is_alive(): - kill_process_tree(self.proc.pid) - - # Remove zmq ipc socket files - ipc_sockets = [self.output_path, self.input_path] - for ipc_socket in ipc_sockets: - socket_file = ipc_socket.replace("ipc://", "") - if os and os.path.exists(socket_file): - os.remove(socket_file) @dataclass diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fe755df95cbe7..a585b9bcac80d 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -186,10 +186,12 @@ async def add_request( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 2) Create Queue (output_handler() pushes, generate() pulls) + # 2) Create Queue (output_handler() pushes, generate() pulls). self.rid_to_queue[request_id] = asyncio.Queue() # 3) Send to Detokenizer (which forwards to EngineCore). + # Note: we forward the request rather than sending to each + # process separately to avoid race conditions in Detokenizer. await self.detokenizer_client.input_socket.send_pyobj(engine_request) return self.rid_to_queue[request_id] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 59b66d0de11b9..01b0f3a92a948 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -3,8 +3,7 @@ import signal import threading import time -import weakref -from typing import List, Optional, Tuple, Type +from typing import List, Tuple, Type import zmq import zmq.asyncio @@ -26,7 +25,8 @@ from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus -from vllm.v1.utils import zmq_socket_ctx +from vllm.v1.utils import (zmq_socket_ctx, BackgroundProcHandle, + MPBackgroundProcess) from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -330,31 +330,16 @@ def process_output_socket(self, output_path: str): encoder.encode_into(outputs, buffer) socket.send_multipart((buffer, ), copy=False) -class MPEngineCoreClient: - """ - MPEngineCoreClient: client for multi-proc EngineCore. - EngineCore runs in a background process busy loop, getting - new EngineRequests and returning EngineCoreOutputs - * pushes EngineRequests via input_socket - * pulls EngineCoreOutputs via output_socket - """ +class MPEngineCoreClient(MPBackgroundProcess): + """MPEngineCoreClient: client for multi-proc EngineCore.""" def __init__(self, *args, input_path: str, output_path: str, **kwargs): - # Start EngineCore in background process. - self.proc_handle: Optional[BackgroundProcHandle] - self.proc_handle = EngineCoreProc.make_engine_core_process( + super().__init__( *args, + fn=EngineCoreProc.make_engine_core_process, input_path=input_path, output_path=output_path, - **kwargs, ) - self._finalizer = weakref.finalize(self, self.shutdown) - def shutdown(self): - if hasattr(self, "proc_handle") and self.proc_handle: - self.proc_handle.shutdown() - self.proc_handle = None - - def __del__(self): - self.shutdown() + diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 9bd26ded88b53..caef661320cb2 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -173,14 +173,16 @@ def add_request( priority: int = 0, ) -> None: - # 1) Process raw inputs into the request. + # Process raw inputs into the request. 
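+ # (The Processor runs in this process, so no IPC is involved at this step.)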
engine_request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 2) Add to Detokenizer and EngineCore. + # Add to Detokenizer and EngineCore. if self.multiprocess_mode: # Send to Detokenizer (which forwards to EngineCore). + # Note: we forward the message rather than sending + # to each process separately to avoid race conditions. self.detokenizer_client.input_socket.send_pyobj(engine_request) else: # Add directly to Detokenizer and EngineCore. @@ -192,16 +194,13 @@ def step(self) -> List[RequestOutput]: if self.multiprocess_mode: # Get next output from the Detokenizer. return self.detokenizer_client.output_socket.recv_pyobj() - else: - # 1) Get EngineCoreOutput from the EngineCore. + # Step EngineCore and Detokenizer. engine_core_outputs = self.engine_core.step() - - # 2) Detokenizee the EngineCoreOutput. request_outputs, requests_to_abort = self.detokenizer.step( engine_core_outputs) - # 3) Abort requests that finished due to stopping criteria. + # Abort any requests that hit a stop string. if requests_to_abort: self.abort_request(requests_to_abort) @@ -236,5 +235,8 @@ def __del__(self): self.shutdown() def shutdown(self): - if engine_core := getattr(self, "engine_core", None): - engine_core.shutdown() + if engine_core_client := getattr(self, "engine_core_client", None): + engine_core_client.shutdown() + + if detokenizer_client := getattr(self, "detokenizer_client", None): + detokenizer_client.shutdown() diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index f610c4c90d3dd..bf0712c80c81d 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -4,7 +4,7 @@ from collections.abc import Sequence from contextlib import contextmanager from typing import (Any, Generic, Iterator, List, Optional, TypeVar, Union, - overload) + overload, Callable) import zmq import zmq.asyncio @@ -126,4 +126,47 @@ def zmq_socket_ctx( finally: ctx.destroy(linger=0) - + +from multiprocessing.process import BaseProcess +from vllm.utils import kill_process_tree +import os +import weakref +from dataclasses import dataclass + +@dataclass +class BackgroundProcHandle: + proc: BaseProcess + input_path: str + output_path: str + + def shutdown(self): + # Shutdown the process if needed. + if self.proc.is_alive(): + self.proc.terminate() + self.proc.join(5) + + if self.proc.is_alive(): + kill_process_tree(self.proc.pid) + + # Remove zmq ipc socket files + ipc_sockets = [self.output_path, self.input_path] + for ipc_socket in ipc_sockets: + socket_file = ipc_socket.replace("ipc://", "") + if os and os.path.exists(socket_file): + os.remove(socket_file) + + +class MPBackgroundProcess: + def __init__(self, *args, fn: Callable, input_path: str, output_path: str, **kwargs): + # Start EngineCore in background process. 
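+ # (`fn` is expected to return a BackgroundProcHandle for the started process.)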
+ self.proc_handle: Optional[BackgroundProcHandle] + self.proc_handle = fn(*args, input_path, output_path, kwargs) + self._finalizer = weakref.finalize(self, self.shutdown) + + def __del__(self): + self.shutdown() + + def shutdown(self): + if hasattr(self, "proc_handle") and self.proc_handle: + self.proc_handle.shutdown() + self.proc_handle = None \ No newline at end of file From 395742e92c7830970f914cc8f143b1f0e2061d7c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 20:07:19 +0000 Subject: [PATCH 110/132] again --- vllm/v1/engine/core.py | 3 +-- vllm/v1/utils.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 01b0f3a92a948..f1cde6c3085e9 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -20,8 +20,7 @@ from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineAbortRequest, EngineRequest, - EngineProfileRequest, EngineRequestUnion, - BackgroundProcHandle) + EngineProfileRequest, EngineRequestUnion) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index bf0712c80c81d..53a4c2dc949db 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -157,6 +157,7 @@ def shutdown(self): class MPBackgroundProcess: + def __init__(self, *args, fn: Callable, input_path: str, output_path: str, **kwargs): # Start EngineCore in background process. self.proc_handle: Optional[BackgroundProcHandle] From 2d6ceb87de498370b19f1b8cc682070d3235e17d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 20:57:08 +0000 Subject: [PATCH 111/132] updated --- vllm/v1/engine/async_llm.py | 43 +++++++++----- vllm/v1/engine/core.py | 73 ++++++++--------------- vllm/v1/engine/detokenizer.py | 107 +++++++++------------------------- vllm/v1/utils.py | 58 +++++++++++++----- 4 files changed, 125 insertions(+), 156 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a585b9bcac80d..85f4b29021be6 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -16,6 +16,9 @@ # Inspired by https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/managers/tokenizer_manager.py import asyncio +import zmq +import zmq.asyncio + from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -39,6 +42,7 @@ from vllm.v1.engine.detokenizer import MPDetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor +from vllm.v1.utils import zmq_socket_ctx, make_zmq_socket logger = init_logger(__name__) @@ -87,9 +91,18 @@ def __init__( # IPC paths. from_engine_core_path = get_open_zmq_ipc_path() to_engine_core_path = get_open_zmq_ipc_path() + self.to_detokenizer_path = get_open_zmq_ipc_path() + self.from_detokenizer_path = get_open_zmq_ipc_path() + + # Detokenizer IPC. + self.ctx = zmq.asyncio.Context(io_threads=2) + self.to_detokenizer = make_zmq_socket( + self.ctx, self.to_detokenizer_path, zmq.PULL) # Detokenizer (background process). 
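+ # Receives new requests from this process and EngineCoreOutputs from EngineCore over ZMQ.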
self.detokenizer_client = MPDetokenizerClient( + output_path=self.from_detokenizer_path, + input_path=self.to_detokenizer_path, from_engine_core_path=from_engine_core_path, to_engine_core_path=to_engine_core_path, tokenizer_name=vllm_config.model_config.tokenizer, @@ -145,6 +158,9 @@ def from_engine_args( def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" + if ctx := getattr(self, "ctx", None): + ctx.destroy(linger=0) + if output_handler := getattr(self, "output_hander", None): output_handler.cancel() @@ -192,7 +208,7 @@ async def add_request( # 3) Send to Detokenizer (which forwards to EngineCore). # Note: we forward the request rather than sending to each # process separately to avoid race conditions in Detokenizer. - await self.detokenizer_client.input_socket.send_pyobj(engine_request) + await self.to_detokenizer.send_pyobj(engine_request) return self.rid_to_queue[request_id] @@ -270,18 +286,19 @@ async def generate( async def output_handler_loop(self): """Background loop: pulls from Detokenizer and push to Queues.""" - while True: - # Note: use socket directly to avoid calling await multiple - # times, which causes too much task switching at high QPS. - outputs: List[RequestOutput] = [] - outputs = await self.detokenizer_client.output_socket.recv_pyobj() - - for out in outputs: - # Note: it is possible that a request was aborted - # due to client cancellation while EngineCoreOutputs - # are still flowing, so we just ignore. - if out.request_id in self.rid_to_queue: - self.rid_to_queue[out.request_id].put_nowait(out) + with zmq_socket_ctx(self.from_detokenizer_path, zmq.PULL) as socket: + while True: + # Note: use socket directly to avoid calling await multiple + # times, which causes too much task switching at high QPS. + outputs: List[RequestOutput] = [] + outputs = await socket.recv_pyobj() + + for out in outputs: + # Note: it is possible that a request was aborted + # due to client cancellation while EngineCoreOutputs + # are still flowing, so we just ignore. + if out.request_id in self.rid_to_queue: + self.rid_to_queue[out.request_id].put_nowait(out) async def abort(self, request_id: str): diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f1cde6c3085e9..9781ceaf04e56 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -24,8 +24,7 @@ from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus -from vllm.v1.utils import (zmq_socket_ctx, BackgroundProcHandle, - MPBackgroundProcess) +from vllm.v1.utils import zmq_socket_ctx, MPBackgroundProcess from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -136,6 +135,8 @@ def profile(self, is_start: bool = True): class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" + READY_STR = "READY" + def __init__( self, vllm_config: VllmConfig, @@ -148,10 +149,9 @@ def __init__( super().__init__(vllm_config, executor_class, usage_context) # Background Threads and Queues for IO. These enable us to - # overlap ZMQ socket IO with GPU since they release the GIL, - # and to overlap some serialization/deserialization with the - # model forward pass. - # Threads handle Socket <-> Queues and core_busy_loop uses Queue. + # overlap ZMQ IO with GPU since they release the GIL and + # some serialization/deserialization with the model forward. + # Threads handle Socket <-> Queues and busy_loop uses Queues. 
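+ # (See process_input_socket / process_output_socket below.)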
self.input_queue: queue.Queue[EngineRequestUnion] = queue.Queue() self.output_queue: queue.Queue[List[EngineCoreOutput]] = queue.Queue() threading.Thread(target=self.process_input_socket, @@ -162,42 +162,8 @@ def __init__( daemon=True).start() # Send Readiness signal to EngineClient. - ready_pipe.send({"status": "READY"}) - - @staticmethod - def make_engine_core_process( - vllm_config: VllmConfig, - executor_class: Type[Executor], - usage_context: UsageContext, - input_path: str, - output_path: str, - ) -> BackgroundProcHandle: - context = get_mp_context() - reader, writer = context.Pipe(duplex=False) - - process_kwargs = { - "input_path": input_path, - "output_path": output_path, - "ready_pipe": writer, - "vllm_config": vllm_config, - "executor_class": executor_class, - "usage_context": usage_context, - } - - # Run EngineCore busy loop in background process. - proc = context.Process(target=EngineCoreProc.run_engine_core, - kwargs=process_kwargs) - proc.start() - - # Wait for startup. - if reader.recv()["status"] != "READY": - raise RuntimeError( - "EngineCore initalization failed. See root cause above." - ) + ready_pipe.send({"status": EngineCoreProc.READY_STR}) - return BackgroundProcHandle(proc=proc, - input_path=input_path, - output_path=output_path) @staticmethod def run_engine_core(*args, **kwargs): @@ -331,14 +297,25 @@ def process_output_socket(self, output_path: str): class MPEngineCoreClient(MPBackgroundProcess): - """MPEngineCoreClient: client for multi-proc EngineCore.""" + """Client for multi-proc EngineCore.""" - def __init__(self, *args, input_path: str, output_path: str, **kwargs): - super().__init__( - *args, - fn=EngineCoreProc.make_engine_core_process, + def __init__(self, + input_path: str, + output_path: str, + vllm_config: VllmConfig, + executor_class: Type[Executor], + usage_context: UsageContext): + + super().__init__() + + self.proc_handle = MPBackgroundProcess.wait_for_startup( input_path=input_path, output_path=output_path, + process_name="EngineCore", + target_fn=EngineCoreProc.run_engine_core, + process_kwargs={ + "vllm_config": vllm_config, + "executor_class": executor_class, + "usage_context": usage_context, + }, ) - - diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index be9857f1f6aa0..455fb442f6c51 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -4,10 +4,10 @@ import msgspec import signal from dataclasses import dataclass +from multiprocessing.connection import Connection from typing import Dict, Iterable, List, Optional, Tuple,Union from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.executor.multiproc_worker_utils import get_mp_context from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import RequestOutputKind @@ -17,10 +17,8 @@ from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, get_exception_traceback) from vllm.v1.engine import (EngineCoreOutputs, - BackgroundProcHandle, EngineRequest, EngineAbortRequest) -from vllm.v1.utils import (make_zmq_socket, zmq_socket_ctx, - wait_for_startup) +from vllm.v1.utils import zmq_socket_ctx, MPBackgroundProcess logger = init_logger(__name__) @@ -298,8 +296,8 @@ def __init__( to_engine_core_path: str, input_path: str, output_path: str, - write_: str, - **kwargs + ready_pipe: Connection, + **kwargs, ): super().__init__(*args, **kwargs) @@ -308,50 +306,9 @@ def __init__( self.input_path = input_path self.output_path = output_path - # Send readiness signal. 
- with zmq_socket_ctx(ready_path, zmq.PUSH) as ready_socket: - ready_socket.send_string(DetokenizerProc.READY_STR) + # Send Readiness signal to DetokenizerClient. + ready_pipe.send({"status": DetokenizerProc.READY_STR}) - - @staticmethod - def make_detokenizer_process( - from_engine_core_path: str, - to_engine_core_path: str, - input_path: str, - output_path: str, - tokenizer_name: str, - tokenizer_mode: str = "auto", - trust_remote_code: bool = False, - revision: Optional[str] = None, - ) -> BackgroundProcHandle: - context = get_mp_context() - reader, writer = context.Pipe(duplex=False) - - process_kwargs = { - "from_engine_core_path": from_engine_core_path, - "to_engine_core_path": to_engine_core_path, - "input_path": input_path, - "output_path": output_path, - "ready_pipe": writer, - "tokenizer_name": tokenizer_name, - "tokenizer_mode": tokenizer_mode, - "trust_remote_code": trust_remote_code, - "revision": revision, - } - # Run Detokenizer busy loop in background process. - proc = context.Process(target=DetokenizerProc.run_detokenizer, - kwargs=process_kwargs) - proc.start() - - # Wait for startup. - if reader.recv()["status"] != "READY": - raise RuntimeError( - "Detokenizer initalization failed. See root cause above." - ) - - return BackgroundProcHandle(proc=proc, - input_path=input_path, - output_path=output_path) @staticmethod def run_detokenizer(*args, **kwargs): @@ -475,44 +432,32 @@ def run_busy_loop(self): decoder=decoder, ) -class MPDetokenizerClient: +class MPDetokenizerClient(MPBackgroundProcess): + """Client for multi-proc Detokenizer.""" def __init__(self, - *args, + input_path: str, + output_path: str, from_engine_core_path: str, to_engine_core_path: str, - **kwargs): - - # ZMQ setup. - self.ctx = zmq.asyncio.Context(2) - - # Get input (DetokenizerRequest) to Detokenizer. - input_path = get_open_zmq_ipc_path() - self.input_socket = make_zmq_socket( - self.ctx, - input_path, - zmq.PUSH, - ) + tokenizer_name: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + revision: Optional[str] = None): - # Get output (RequestOutput) from Detokenizer. - output_path = get_open_zmq_ipc_path() - self.output_socket = make_zmq_socket(self.ctx, - output_path, - zmq.PULL, - ) + super().__init__() - # Start Detokenizer in background process. 
- self.proc_handle: Optional[BackgroundProcHandle] - self.proc_handle = DetokenizerProc.make_detokenizer_process( - *args, - from_engine_core_path=from_engine_core_path, - to_engine_core_path=to_engine_core_path, + self.proc_handle = MPBackgroundProcess.wait_for_startup( input_path=input_path, output_path=output_path, - **kwargs, + process_name="Detokenizer", + target_fn=DetokenizerProc.run_detokenizer, + process_kwargs={ + "from_engine_core_path": from_engine_core_path, + "to_engine_core_path": to_engine_core_path, + "tokenizer_name": tokenizer_name, + "tokenizer_mode": tokenizer_mode, + "trust_remote_code": trust_remote_code, + "revision": revision, + }, ) - - def shutdown(self): - if hasattr(self, "proc_handle") and self.proc_handle: - self.proc_handle.shutdown() - self.proc_handle = None diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 53a4c2dc949db..d52c343966353 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,15 +1,18 @@ +import os +import weakref +from dataclasses import dataclass from multiprocessing.process import BaseProcess -from multiprocessing.connection import Connection - from collections.abc import Sequence from contextlib import contextmanager -from typing import (Any, Generic, Iterator, List, Optional, TypeVar, Union, - overload, Callable) +from typing import (Any, Generic, Dict, Iterator, List, Optional, TypeVar, + Union, Callable, overload) import zmq import zmq.asyncio from vllm.logger import init_logger +from vllm.utils import kill_process_tree +from vllm.executor.multiproc_worker_utils import get_mp_context logger = init_logger(__name__) @@ -127,11 +130,6 @@ def zmq_socket_ctx( finally: ctx.destroy(linger=0) -from multiprocessing.process import BaseProcess -from vllm.utils import kill_process_tree -import os -import weakref -from dataclasses import dataclass @dataclass class BackgroundProcHandle: @@ -157,11 +155,11 @@ def shutdown(self): class MPBackgroundProcess: - - def __init__(self, *args, fn: Callable, input_path: str, output_path: str, **kwargs): - # Start EngineCore in background process. + + READY_STR = "READY" + + def __init__(self): self.proc_handle: Optional[BackgroundProcHandle] - self.proc_handle = fn(*args, input_path, output_path, kwargs) self._finalizer = weakref.finalize(self, self.shutdown) def __del__(self): @@ -170,4 +168,36 @@ def __del__(self): def shutdown(self): if hasattr(self, "proc_handle") and self.proc_handle: self.proc_handle.shutdown() - self.proc_handle = None \ No newline at end of file + self.proc_handle = None + + @staticmethod + def wait_for_startup( + input_path: str, + output_path: str, + process_name: str, + target_fn: Callable, + process_kwargs: Dict[Any, Any], + ) -> "MPBackgroundProcess": + context = get_mp_context() + reader, writer = context.Pipe(duplex=False) + + assert ("ready_pipe" not in process_kwargs and + "input_path" not in process_kwargs and + "output_path" not in process_kwargs) + process_kwargs["ready_pipe"] = writer + process_kwargs["input_path"] = input_path + process_kwargs["output_path"] = output_path + + # Run Detokenizer busy loop in background process. + proc = context.Process(target=target_fn, + kwargs=process_kwargs) + proc.start() + + # Wait for startup. + if reader.recv()["status"] != "READY": + raise RuntimeError( + f"{process_name} initalization failed. " + "See root cause above." 
+ ) + + return BackgroundProcHandle(proc, input_path, output_path) From a19cb83c93ea7a53056f4a06fa2e87e988b5a381 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 20:58:39 +0000 Subject: [PATCH 112/132] cleanup --- vllm/entrypoints/openai/api_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index ed3634a50e7e2..922f036a7f6b3 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -671,11 +671,11 @@ async def run_server(args, **uvicorn_kwargs) -> None: # workaround to ensure user has enough fds available for uvicorn + ipc set_ulimit() - def sigterm_handler(*_) -> None: + def signal_handler(*_) -> None: # Interrupt server on sigterm while initializing raise KeyboardInterrupt("terminated") - signal.signal(signal.SIGTERM, sigterm_handler) + signal.signal(signal.SIGTERM, signal_handler) # The child processes will send SIGQUIT to this process when # any error happens. This process then clean up the whole tree. From 1695fddf712de610d020d3aa322bccaeff2dd2e5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 20:58:56 +0000 Subject: [PATCH 113/132] cleaning --- vllm/entrypoints/openai/api_server.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 922f036a7f6b3..76d9a2bd714cd 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -684,7 +684,6 @@ def sigquit_handler(signum, frame): signal.signal(signal.SIGQUIT, sigquit_handler) - async with build_async_engine_client(args) as engine_client: app = build_app(args) From b2f845b1f60e241b58e1773f8d54d40dca29032d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 22:10:10 +0000 Subject: [PATCH 114/132] updated --- vllm/v1/engine/__init__.py | 10 +++++ vllm/v1/engine/async_llm.py | 67 +++++++++++++++++------------- vllm/v1/engine/core.py | 10 +++-- vllm/v1/engine/detokenizer.py | 76 +++++++++++++++-------------------- vllm/v1/utils.py | 8 ++-- 5 files changed, 91 insertions(+), 80 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 3af0219db1c14..a99f8a617fd8f 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,3 +1,4 @@ +import enum from dataclasses import dataclass from typing import List, Optional, Union @@ -60,3 +61,12 @@ class EngineCoreOutputs( # [num_reqs] outputs: List[EngineCoreOutput] + + +class EngineRequestType(enum.Enum): + """ + Request types defined as hex byte strings, so it can be sent over sockets + without separate encoding step. 
+ """ + FROM_ENGINE_CORE = b'\x00' + FROM_ENGINE = b'\x01' diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 85f4b29021be6..acf5e028475cd 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -18,8 +18,9 @@ import asyncio import zmq import zmq.asyncio +import pickle -from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union +from typing import Any, AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs @@ -37,12 +38,12 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils import get_open_zmq_ipc_path -from vllm.v1.engine import EngineAbortRequest +from vllm.v1.engine import EngineAbortRequest, EngineRequestType from vllm.v1.engine.core import MPEngineCoreClient from vllm.v1.engine.detokenizer import MPDetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.utils import zmq_socket_ctx, make_zmq_socket +from vllm.v1.utils import make_zmq_socket logger = init_logger(__name__) @@ -89,21 +90,22 @@ def __init__( ) # IPC paths. - from_engine_core_path = get_open_zmq_ipc_path() + to_detokenizer_path = get_open_zmq_ipc_path() to_engine_core_path = get_open_zmq_ipc_path() - self.to_detokenizer_path = get_open_zmq_ipc_path() - self.from_detokenizer_path = get_open_zmq_ipc_path() + to_llm_engine_path = get_open_zmq_ipc_path() + # Detokenizer IPC. self.ctx = zmq.asyncio.Context(io_threads=2) + self.from_detokenizer = make_zmq_socket( + self.ctx, to_llm_engine_path, zmq.PULL) self.to_detokenizer = make_zmq_socket( - self.ctx, self.to_detokenizer_path, zmq.PULL) - + self.ctx, to_detokenizer_path, zmq.PUSH) + # Detokenizer (background process). self.detokenizer_client = MPDetokenizerClient( - output_path=self.from_detokenizer_path, - input_path=self.to_detokenizer_path, - from_engine_core_path=from_engine_core_path, + output_path=to_llm_engine_path, + input_path=to_detokenizer_path, to_engine_core_path=to_engine_core_path, tokenizer_name=vllm_config.model_config.tokenizer, tokenizer_mode=vllm_config.model_config.tokenizer_mode, @@ -114,7 +116,7 @@ def __init__( # EngineCore (background process). self.engine_core_client = MPEngineCoreClient( input_path=to_engine_core_path, - output_path=from_engine_core_path, + output_path=to_detokenizer_path, vllm_config=vllm_config, executor_class=executor_class, usage_context=usage_context, @@ -207,8 +209,8 @@ async def add_request( # 3) Send to Detokenizer (which forwards to EngineCore). # Note: we forward the request rather than sending to each - # process separately to avoid race conditions in Detokenizer. - await self.to_detokenizer.send_pyobj(engine_request) + # process separately to avoid race conditions in Detokenizer). + await self.send_to_detokenizer(engine_request) return self.rid_to_queue[request_id] @@ -286,27 +288,28 @@ async def generate( async def output_handler_loop(self): """Background loop: pulls from Detokenizer and push to Queues.""" - with zmq_socket_ctx(self.from_detokenizer_path, zmq.PULL) as socket: - while True: - # Note: use socket directly to avoid calling await multiple - # times, which causes too much task switching at high QPS. 
- outputs: List[RequestOutput] = [] - outputs = await socket.recv_pyobj() - - for out in outputs: - # Note: it is possible that a request was aborted - # due to client cancellation while EngineCoreOutputs - # are still flowing, so we just ignore. - if out.request_id in self.rid_to_queue: - self.rid_to_queue[out.request_id].put_nowait(out) + epoch = 0 + while True: + logger.info(f"EPOCH: {epoch}") + epoch+=1 + # Note: use socket directly to avoid calling await multiple + # times, which causes too much task switching at high QPS. + outputs: List[RequestOutput] = [] + outputs = await self.from_detokenizer.recv_pyobj() + + for out in outputs: + # Note: it is possible that a request was aborted + # due to client cancellation while EngineCoreOutputs + # are still flowing, so we just ignore. + if out.request_id in self.rid_to_queue: + self.rid_to_queue[out.request_id].put_nowait(out) async def abort(self, request_id: str): """Abort request if the client cancels the request.""" # Send abort to Detokenizer (which will fwd to EngineCore). - await self.detokenizer_client.input_socket.send_pyobj( - EngineAbortRequest([request_id])) + await self.send_to_detokenizer(EngineAbortRequest([request_id])) # Remove from request output queues. if request_id in self.rid_to_queue: @@ -314,6 +317,12 @@ async def abort(self, request_id: str): if self.log_requests: logger.info("Aborted %s.", request_id) + + async def send_to_detokenizer(self, object: Any): + """Send object to Detokenizer with a FROM_ENGINE flag.""" + + msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(object)) + await self.to_detokenizer.send_multipart(msg, copy=False) def encode( self, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 9781ceaf04e56..151cafbe62ae1 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -20,7 +20,8 @@ from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineAbortRequest, EngineRequest, - EngineProfileRequest, EngineRequestUnion) + EngineRequestType, EngineProfileRequest, + EngineRequestUnion) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus @@ -162,7 +163,7 @@ def __init__( daemon=True).start() # Send Readiness signal to EngineClient. 
- ready_pipe.send({"status": EngineCoreProc.READY_STR}) + ready_pipe.send({"status": "READY"}) @staticmethod @@ -226,6 +227,7 @@ def run_busy_loop(self): self._handle_client_request(req) break except queue.Empty: + logger.info(f"EPOCH: {epoch}") self._log_stats() logger.debug("EngineCore busy loop waiting.") except BaseException: @@ -293,7 +295,8 @@ def process_output_socket(self, output_path: str): engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) encoder.encode_into(outputs, buffer) - socket.send_multipart((buffer, ), copy=False) + msg = (EngineRequestType.FROM_ENGINE_CORE.value, buffer) + socket.send_multipart(msg, copy=False) class MPEngineCoreClient(MPBackgroundProcess): @@ -319,3 +322,4 @@ def __init__(self, "usage_context": usage_context, }, ) + print("STARTED ENGINE CORE") diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 455fb442f6c51..0511b8ccdfa12 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -14,11 +14,10 @@ from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, - get_exception_traceback) -from vllm.v1.engine import (EngineCoreOutputs, +from vllm.utils import get_exception_traceback +from vllm.v1.engine import (EngineCoreOutputs, EngineRequestType, EngineRequest, EngineAbortRequest) -from vllm.v1.utils import zmq_socket_ctx, MPBackgroundProcess +from vllm.v1.utils import make_zmq_socket, MPBackgroundProcess logger = init_logger(__name__) @@ -287,27 +286,23 @@ def step( class DetokenizerProc(Detokenizer): """ZMQ-wrapper for running Detokenizer in background process.""" - READY_STR = "READY" - def __init__( self, *args, - from_engine_core_path: str, - to_engine_core_path: str, input_path: str, output_path: str, + to_engine_core_path: str, ready_pipe: Connection, **kwargs, ): super().__init__(*args, **kwargs) - self.from_engine_core_path = from_engine_core_path - self.to_engine_core_path = to_engine_core_path self.input_path = input_path self.output_path = output_path + self.to_engine_core_path = to_engine_core_path # Send Readiness signal to DetokenizerClient. - ready_pipe.send({"status": DetokenizerProc.READY_STR}) + ready_pipe.send({"status": "READY"}) @staticmethod @@ -350,13 +345,12 @@ def signal_handler(signum, frame): def _handle_from_llm_engine( self, - from_llm_engine: zmq.Socket, + request_bytes: bytes, to_engine_core: zmq.Socket, ) -> None: """Handle EngineRequest from the LLMEngine.""" - pickled_req = from_llm_engine.recv() - req = pickle.loads(pickled_req) + req = pickle.loads(request_bytes) if isinstance(req, EngineRequest): self.add_request(req) @@ -366,11 +360,11 @@ def _handle_from_llm_engine( raise ValueError(f"Unknown type: {req}") # Forward to EngineCore. - to_engine_core.send(pickled_req) + to_engine_core.send(request_bytes) def _handle_from_engine_core( self, - from_engine_core: zmq.Socket, + output_bytes: bytes, to_engine_core: zmq.Socket, to_llm_engine: zmq.Socket, decoder: msgspec.msgpack.Decoder, @@ -378,8 +372,7 @@ def _handle_from_engine_core( """Handle Outputs from the EngineCore.""" # Deserialize the EngineOutput (use msgpack for performance). - (frame, ) = from_engine_core.recv_multipart(copy=False) - outputs: EngineCoreOutputs = decoder.decode(frame.buffer) + outputs: EngineCoreOutputs = decoder.decode(output_bytes) # Detokenize. 
request_outputs, requests_to_abort = self.step(outputs) @@ -398,39 +391,37 @@ def run_busy_loop(self): decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) - with (zmq_socket_ctx(self.from_engine_core_path, zmq.PULL) as from_engine_core, - zmq_socket_ctx(self.to_engine_core_path, zmq.PUSH) as to_engine_core, - zmq_socket_ctx(self.input_path, zmq.PULL) as from_llm_engine, - zmq_socket_ctx(self.output_path, zmq.PUSH) as to_llm_engine): - - # TODO(rob): avoid poll by having both EngineCore and - # LLMEngine send to the same socket. - poller = zmq.Poller() - poller.register(from_engine_core, zmq.POLLIN) - poller.register(from_llm_engine, zmq.POLLIN) - + ctx = zmq.Context(io_threads=2) + try: + input_socket = make_zmq_socket(ctx, self.input_path, zmq.PULL) + to_llm_engine = make_zmq_socket(ctx, self.output_path, zmq.PUSH) + to_engine_core = make_zmq_socket(ctx, self.to_engine_core_path, zmq.PUSH) epoch = 0 while True: - logger.info(f"EPOCH: {epoch}") + (msg_type, msg_bytes) = input_socket.recv_multipart() - socks = dict(poller.poll()) + # Handle message from LLMEngine (Abort or New Request). + if msg_type == EngineRequestType.FROM_ENGINE.value: + self._handle_from_llm_engine(msg_bytes, to_engine_core) - # Handle input from LLMEngine. - if from_llm_engine in socks: - self._handle_from_llm_engine( - from_llm_engine=from_llm_engine, - to_engine_core=to_engine_core, - ) - - # Handle output from EngineCoreOutput. - if from_engine_core in socks: + # Handle message from EngineCore (EngineCoreOutputs). + elif msg_type == EngineRequestType.FROM_ENGINE_CORE.value: epoch += 1 self._handle_from_engine_core( - from_engine_core=from_engine_core, + output_bytes=msg_bytes, to_engine_core=to_engine_core, to_llm_engine=to_llm_engine, decoder=decoder, ) + else: + raise ValueError(f"Unknown Message Type: {msg_type}") + + except KeyboardInterrupt: + logger.debug("Got Keyboard Interrupt.") + + finally: + ctx.destroy(linger=0) + class MPDetokenizerClient(MPBackgroundProcess): """Client for multi-proc Detokenizer.""" @@ -438,7 +429,6 @@ class MPDetokenizerClient(MPBackgroundProcess): def __init__(self, input_path: str, output_path: str, - from_engine_core_path: str, to_engine_core_path: str, tokenizer_name: str, tokenizer_mode: str = "auto", @@ -453,7 +443,6 @@ def __init__(self, process_name="Detokenizer", target_fn=DetokenizerProc.run_detokenizer, process_kwargs={ - "from_engine_core_path": from_engine_core_path, "to_engine_core_path": to_engine_core_path, "tokenizer_name": tokenizer_name, "tokenizer_mode": tokenizer_mode, @@ -461,3 +450,4 @@ def __init__(self, "revision": revision, }, ) + print("STARTED DETOKENIZER") diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index d52c343966353..508474ea53f57 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -103,11 +103,11 @@ def make_zmq_socket(ctx: Union[zmq.asyncio.Context, zmq.Context], path: str, if type == zmq.PULL: socket.setsockopt(zmq.RCVHWM, 0) socket.setsockopt(zmq.RCVBUF, buf_size) - socket.connect(path) + socket.bind(path) elif type == zmq.PUSH: socket.setsockopt(zmq.SNDHWM, 0) socket.setsockopt(zmq.SNDBUF, buf_size) - socket.bind(path) + socket.connect(path) else: raise ValueError(f"Unknown Socket Type: {type}") @@ -125,7 +125,7 @@ def zmq_socket_ctx( yield make_zmq_socket(ctx, path, type) except KeyboardInterrupt: - logger.debug("Worker had Keyboard Interrupt.") + logger.debug("Got Keyboard Interrupt.") finally: ctx.destroy(linger=0) @@ -156,8 +156,6 @@ def shutdown(self): class MPBackgroundProcess: - READY_STR = "READY" - def __init__(self): 
self.proc_handle: Optional[BackgroundProcHandle] self._finalizer = weakref.finalize(self, self.shutdown) From 12df407bd991e6255bd1bea37d82ead4e7ad248a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 22:13:15 +0000 Subject: [PATCH 115/132] remove epoch --- vllm/v1/engine/async_llm.py | 3 --- vllm/v1/engine/core.py | 7 ------- vllm/v1/engine/detokenizer.py | 2 -- 3 files changed, 12 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index acf5e028475cd..4e791e8f06565 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -288,10 +288,7 @@ async def generate( async def output_handler_loop(self): """Background loop: pulls from Detokenizer and push to Queues.""" - epoch = 0 while True: - logger.info(f"EPOCH: {epoch}") - epoch+=1 # Note: use socket directly to avoid calling await multiple # times, which causes too much task switching at high QPS. outputs: List[RequestOutput] = [] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 151cafbe62ae1..e4339222f7539 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -212,13 +212,7 @@ def run_busy_loop(self): """Core busy loop of the EngineCore.""" # Loop until process is sent a SIGINT or SIGTERM - epoch = 0 while True: - logger.info(f"EPOCH: {epoch}") - epoch += 1 - # if epoch == 10: - # raise ValueError("Died") - # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): while True: @@ -227,7 +221,6 @@ def run_busy_loop(self): self._handle_client_request(req) break except queue.Empty: - logger.info(f"EPOCH: {epoch}") self._log_stats() logger.debug("EngineCore busy loop waiting.") except BaseException: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 0511b8ccdfa12..870c4a7501a36 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -396,7 +396,6 @@ def run_busy_loop(self): input_socket = make_zmq_socket(ctx, self.input_path, zmq.PULL) to_llm_engine = make_zmq_socket(ctx, self.output_path, zmq.PUSH) to_engine_core = make_zmq_socket(ctx, self.to_engine_core_path, zmq.PUSH) - epoch = 0 while True: (msg_type, msg_bytes) = input_socket.recv_multipart() @@ -406,7 +405,6 @@ def run_busy_loop(self): # Handle message from EngineCore (EngineCoreOutputs). elif msg_type == EngineRequestType.FROM_ENGINE_CORE.value: - epoch += 1 self._handle_from_engine_core( output_bytes=msg_bytes, to_engine_core=to_engine_core, From b7843c93060c7684c37f6117261ab7a1e0df6d05 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 22:17:46 +0000 Subject: [PATCH 116/132] update --- vllm/v1/engine/__init__.py | 6 +++--- vllm/v1/engine/async_llm.py | 22 ++++++++++------------ vllm/v1/engine/core.py | 17 ++++++----------- vllm/v1/engine/detokenizer.py | 26 ++++++++++++++------------ vllm/v1/engine/llm_engine.py | 6 +++--- vllm/v1/utils.py | 19 ++++++++----------- 6 files changed, 44 insertions(+), 52 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index a99f8a617fd8f..0e104118c9ff9 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -63,10 +63,10 @@ class EngineCoreOutputs( outputs: List[EngineCoreOutput] -class EngineRequestType(enum.Enum): +class EngineRequestType(enum.Enum): """ Request types defined as hex byte strings, so it can be sent over sockets without separate encoding step. 
- """ + """ FROM_ENGINE_CORE = b'\x00' - FROM_ENGINE = b'\x01' + FROM_ENGINE = b'\x01' diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 4e791e8f06565..7a682f79e7972 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -20,7 +20,8 @@ import zmq.asyncio import pickle -from typing import Any, AsyncGenerator, Dict, List, Mapping, Optional, Type, Union +from typing import (Any, AsyncGenerator, Dict, List, Mapping, Optional, Type, + Union) from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs @@ -93,15 +94,14 @@ def __init__( to_detokenizer_path = get_open_zmq_ipc_path() to_engine_core_path = get_open_zmq_ipc_path() to_llm_engine_path = get_open_zmq_ipc_path() - # Detokenizer IPC. self.ctx = zmq.asyncio.Context(io_threads=2) - self.from_detokenizer = make_zmq_socket( - self.ctx, to_llm_engine_path, zmq.PULL) - self.to_detokenizer = make_zmq_socket( - self.ctx, to_detokenizer_path, zmq.PUSH) - + self.from_detokenizer = make_zmq_socket(self.ctx, to_llm_engine_path, + zmq.PULL) + self.to_detokenizer = make_zmq_socket(self.ctx, to_detokenizer_path, + zmq.PUSH) + # Detokenizer (background process). self.detokenizer_client = MPDetokenizerClient( output_path=to_llm_engine_path, @@ -162,7 +162,7 @@ def shutdown(self): if ctx := getattr(self, "ctx", None): ctx.destroy(linger=0) - + if output_handler := getattr(self, "output_hander", None): output_handler.cancel() @@ -278,13 +278,12 @@ async def generate( yield out # Client request cancellation is handled through calling - # task.cancel() on generate(). Calling self.abort() forwards the + # task.cancel() on generate(). Calling self.abort() forwards the # cancellation to the EngineCore and Detokenizer. except asyncio.CancelledError: await self.abort(request_id) raise - async def output_handler_loop(self): """Background loop: pulls from Detokenizer and push to Queues.""" @@ -300,7 +299,6 @@ async def output_handler_loop(self): # are still flowing, so we just ignore. if out.request_id in self.rid_to_queue: self.rid_to_queue[out.request_id].put_nowait(out) - async def abort(self, request_id: str): """Abort request if the client cancels the request.""" @@ -314,7 +312,7 @@ async def abort(self, request_id: str): if self.log_requests: logger.info("Aborted %s.", request_id) - + async def send_to_detokenizer(self, object: Any): """Send object to Detokenizer with a FROM_ENGINE flag.""" diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e4339222f7539..a06fef170e8d5 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -11,7 +11,6 @@ from multiprocessing.connection import Connection from vllm.config import CacheConfig, VllmConfig -from vllm.executor.multiproc_worker_utils import get_mp_context from vllm.logger import init_logger from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) @@ -108,8 +107,8 @@ def add_request(self, request: EngineRequest): def abort_requests(self, request_ids: List[str]): """Abort requests from the scheduler.""" - # TODO: The scheduler doesn't really need to know the - # specific finish reason, TBD whether we propagate that + # TODO: The scheduler doesn't really need to know the + # specific finish reason, TBD whether we propagate that # (i.e. client-aborted vs stop criteria met). 
self.scheduler.finish_requests(request_ids, RequestStatus.FINISHED_ABORTED) @@ -150,7 +149,7 @@ def __init__( super().__init__(vllm_config, executor_class, usage_context) # Background Threads and Queues for IO. These enable us to - # overlap ZMQ IO with GPU since they release the GIL and + # overlap ZMQ IO with GPU since they release the GIL and # some serialization/deserialization with the model forward. # Threads handle Socket <-> Queues and busy_loop uses Queues. self.input_queue: queue.Queue[EngineRequestUnion] = queue.Queue() @@ -165,7 +164,6 @@ def __init__( # Send Readiness signal to EngineClient. ready_pipe.send({"status": "READY"}) - @staticmethod def run_engine_core(*args, **kwargs): """Launch EngineCore busy loop in background process.""" @@ -226,7 +224,7 @@ def run_busy_loop(self): except BaseException: raise - # 2) Handle any new inputs. + # 2) Handle any new client requests (Abort or Add). while not self.input_queue.empty(): req = self.input_queue.get_nowait() self._handle_client_request(req) @@ -295,11 +293,8 @@ def process_output_socket(self, output_path: str): class MPEngineCoreClient(MPBackgroundProcess): """Client for multi-proc EngineCore.""" - def __init__(self, - input_path: str, - output_path: str, - vllm_config: VllmConfig, - executor_class: Type[Executor], + def __init__(self, input_path: str, output_path: str, + vllm_config: VllmConfig, executor_class: Type[Executor], usage_context: UsageContext): super().__init__() diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 870c4a7501a36..384e1a69170af 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -5,7 +5,7 @@ import signal from dataclasses import dataclass from multiprocessing.connection import Connection -from typing import Dict, Iterable, List, Optional, Tuple,Union +from typing import Dict, Iterable, List, Optional, Tuple, Union from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger @@ -23,6 +23,7 @@ POLLING_TIMEOUT_MS = 5000 + @dataclass class IncrementalDetokenizer: @@ -90,7 +91,8 @@ def from_new_request( # NOTE(Nick): could we take ownership of it though? token_ids=request.prompt_token_ids.copy(), stop=stops, - include_stop_str_in_output=sampling_params.include_stop_str_in_output, + include_stop_str_in_output=sampling_params. + include_stop_str_in_output, prefix_offset=prefix_offset, read_offset=read_offset, skip_special_tokens=sampling_params.skip_special_tokens, @@ -247,7 +249,8 @@ def add_request( self.request_states[request.request_id] = request_state def step( - self, encore_core_outputs: EngineCoreOutputs, + self, + encore_core_outputs: EngineCoreOutputs, ) -> Tuple[List[RequestOutput], List[str]]: """Update state and make RequestOutputs for the LLMEngine.""" @@ -283,6 +286,7 @@ def step( # Return to EngineClient. return request_outputs, requests_to_abort + class DetokenizerProc(Detokenizer): """ZMQ-wrapper for running Detokenizer in background process.""" @@ -304,7 +308,6 @@ def __init__( # Send Readiness signal to DetokenizerClient. 
ready_pipe.send({"status": "READY"}) - @staticmethod def run_detokenizer(*args, **kwargs): """Launch Detokenizer busy loop in background process.""" @@ -336,7 +339,7 @@ def signal_handler(signum, frame): except Exception: traceback = get_exception_traceback() - logger.error(f"Detokenizer hit an exception: {traceback}") + logger.error("Detokenizer hit an exception: %s", traceback) parent_process.send_signal(signal.SIGQUIT) finally: @@ -344,7 +347,7 @@ def signal_handler(signum, frame): detokenizer = None def _handle_from_llm_engine( - self, + self, request_bytes: bytes, to_engine_core: zmq.Socket, ) -> None: @@ -361,7 +364,7 @@ def _handle_from_llm_engine( # Forward to EngineCore. to_engine_core.send(request_bytes) - + def _handle_from_engine_core( self, output_bytes: bytes, @@ -382,9 +385,7 @@ def _handle_from_engine_core( # Abort requests that finished due to stop strings. if len(requests_to_abort) > 0: - to_engine_core.send_pyobj( - EngineAbortRequest(requests_to_abort)) - + to_engine_core.send_pyobj(EngineAbortRequest(requests_to_abort)) def run_busy_loop(self): """Core busy loop of the Detokenizer.""" @@ -395,7 +396,8 @@ def run_busy_loop(self): try: input_socket = make_zmq_socket(ctx, self.input_path, zmq.PULL) to_llm_engine = make_zmq_socket(ctx, self.output_path, zmq.PUSH) - to_engine_core = make_zmq_socket(ctx, self.to_engine_core_path, zmq.PUSH) + to_engine_core = make_zmq_socket(ctx, self.to_engine_core_path, + zmq.PUSH) while True: (msg_type, msg_bytes) = input_socket.recv_multipart() @@ -423,7 +425,7 @@ def run_busy_loop(self): class MPDetokenizerClient(MPBackgroundProcess): """Client for multi-proc Detokenizer.""" - + def __init__(self, input_path: str, output_path: str, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index caef661320cb2..be660a4023b30 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -86,7 +86,7 @@ def __init__( executor_class=executor_class, usage_context=usage_context, ) - + else: # Detokenizer (in process). self.detokenizer = Detokenizer( @@ -190,7 +190,7 @@ def add_request( self.engine_core.add_request(engine_request) def step(self) -> List[RequestOutput]: - + if self.multiprocess_mode: # Get next output from the Detokenizer. return self.detokenizer_client.output_socket.recv_pyobj() @@ -203,7 +203,7 @@ def step(self) -> List[RequestOutput]: # Abort any requests that hit a stop string. if requests_to_abort: self.abort_request(requests_to_abort) - + return request_outputs def get_model_config(self): diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 508474ea53f57..fde4601361256 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -4,7 +4,7 @@ from multiprocessing.process import BaseProcess from collections.abc import Sequence from contextlib import contextmanager -from typing import (Any, Generic, Dict, Iterator, List, Optional, TypeVar, +from typing import (Any, Generic, Dict, Iterator, List, Optional, TypeVar, Union, Callable, overload) import zmq @@ -179,23 +179,20 @@ def wait_for_startup( context = get_mp_context() reader, writer = context.Pipe(duplex=False) - assert ("ready_pipe" not in process_kwargs and - "input_path" not in process_kwargs and - "output_path" not in process_kwargs) + assert ("ready_pipe" not in process_kwargs + and "input_path" not in process_kwargs + and "output_path" not in process_kwargs) process_kwargs["ready_pipe"] = writer process_kwargs["input_path"] = input_path process_kwargs["output_path"] = output_path # Run Detokenizer busy loop in background process. 
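# [Editor's note] A self-contained sketch of the readiness handshake around
# the Process(...) call just below: the parent creates a one-way Pipe, hands
# the write end to the child as `ready_pipe`, and blocks on recv() until the
# child has finished initializing. Only the {"status": "READY"} convention is
# taken from the patch; the function and path names here are illustrative.
from multiprocessing import get_context


def background_busy_loop(*, ready_pipe, input_path, output_path):
    # ... heavyweight init (tokenizer, sockets, etc.) would happen here ...
    ready_pipe.send({"status": "READY"})
    # ... a real implementation would now enter its busy loop ...


def start_background_process():
    ctx = get_context("spawn")
    reader, writer = ctx.Pipe(duplex=False)
    proc = ctx.Process(target=background_busy_loop,
                       kwargs={
                           "ready_pipe": writer,
                           "input_path": "ipc:///tmp/example_in",
                           "output_path": "ipc:///tmp/example_out",
                       })
    proc.start()
    # Block until the child reports it is ready (or fail loudly).
    if reader.recv()["status"] != "READY":
        raise RuntimeError("Background process initialization failed.")
    return proc


if __name__ == "__main__":
    start_background_process().join()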
- proc = context.Process(target=target_fn, - kwargs=process_kwargs) + proc = context.Process(target=target_fn, kwargs=process_kwargs) proc.start() - + # Wait for startup. if reader.recv()["status"] != "READY": - raise RuntimeError( - f"{process_name} initalization failed. " - "See root cause above." - ) + raise RuntimeError(f"{process_name} initalization failed. " + "See root cause above.") return BackgroundProcHandle(proc, input_path, output_path) From a6368a7826579ab492dd8da4b20c78701af82918 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 22:24:00 +0000 Subject: [PATCH 117/132] fix typing --- vllm/v1/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index fde4601361256..1544dd104c434 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -175,7 +175,7 @@ def wait_for_startup( process_name: str, target_fn: Callable, process_kwargs: Dict[Any, Any], - ) -> "MPBackgroundProcess": + ) -> BackgroundProcHandle: context = get_mp_context() reader, writer = context.Pipe(duplex=False) From 315efeadba9ebf407227f40c9d5ef8ee2ae3e271 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 22:25:28 +0000 Subject: [PATCH 118/132] remove prints --- vllm/v1/engine/core.py | 1 - vllm/v1/engine/detokenizer.py | 1 - 2 files changed, 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index a06fef170e8d5..136801ffbd617 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -310,4 +310,3 @@ def __init__(self, input_path: str, output_path: str, "usage_context": usage_context, }, ) - print("STARTED ENGINE CORE") diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 384e1a69170af..ed44ad1ffb318 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -450,4 +450,3 @@ def __init__(self, "revision": revision, }, ) - print("STARTED DETOKENIZER") From 740567fbcecce5afc0f7779e1a06618e6d02b0e7 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 22:30:07 +0000 Subject: [PATCH 119/132] updated --- vllm/v1/engine/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 995492ad85df2..72d4a1ecf4511 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -1,5 +1,5 @@ import time -from typing import Mapping, Optional, Tuple, Union +from typing import Mapping, Optional, Union from vllm.config import CacheConfig, LoRAConfig, ModelConfig from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, From cbc043ede0390474a007a87aed2deb28464b4b4f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 23:05:47 +0000 Subject: [PATCH 120/132] fixup git --- vllm/v1/engine/async_llm.py | 8 +------- vllm/v1/engine/detokenizer.py | 2 +- vllm/v1/engine/llm_engine.py | 33 ++++++++++++++++++++++++++------- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7a682f79e7972..9a3a5530cb052 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -39,7 +39,7 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils import get_open_zmq_ipc_path -from vllm.v1.engine import EngineAbortRequest, EngineRequestType +from vllm.v1.engine import EngineAbortRequest from vllm.v1.engine.core import 
MPEngineCoreClient from vllm.v1.engine.detokenizer import MPDetokenizerClient from vllm.v1.engine.processor import Processor @@ -313,12 +313,6 @@ async def abort(self, request_id: str): if self.log_requests: logger.info("Aborted %s.", request_id) - async def send_to_detokenizer(self, object: Any): - """Send object to Detokenizer with a FROM_ENGINE flag.""" - - msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(object)) - await self.to_detokenizer.send_multipart(msg, copy=False) - def encode( self, prompt: PromptType, diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index ed44ad1ffb318..b54cdd80db4ae 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -5,7 +5,7 @@ import signal from dataclasses import dataclass from multiprocessing.connection import Connection -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index be660a4023b30..a7223ae3571ba 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,7 +1,9 @@ -from typing import Dict, List, Mapping, Optional, Type, Union - +from typing import Any, Dict, List, Mapping, Optional, Type, Union from typing_extensions import TypeVar +import zmq +import pickle + from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase @@ -18,10 +20,12 @@ BaseTokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext from vllm.utils import get_open_zmq_ipc_path +from vllm.v1.engine import EngineRequestType from vllm.v1.engine.core import EngineCore, MPEngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer, MPDetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor +from vllm.v1.utils import make_zmq_socket logger = init_logger(__name__) @@ -65,12 +69,21 @@ def __init__( if self.multiprocess_mode: # IPC paths. - from_engine_core_path = get_open_zmq_ipc_path() + to_detokenizer_path = get_open_zmq_ipc_path() to_engine_core_path = get_open_zmq_ipc_path() + to_llm_engine_path = get_open_zmq_ipc_path() + + # Detokenizer IPC. + self.ctx = zmq.Context(io_threads=2) + self.from_detokenizer = make_zmq_socket( + self.ctx, to_llm_engine_path, zmq.PULL) + self.to_detokenizer = make_zmq_socket( + self.ctx, to_detokenizer_path, zmq.PUSH) # Detokenizer (background process). self.detokenizer_client = MPDetokenizerClient( - from_engine_core_path=from_engine_core_path, + output_path=to_llm_engine_path, + input_path=to_detokenizer_path, to_engine_core_path=to_engine_core_path, tokenizer_name=vllm_config.model_config.tokenizer, tokenizer_mode=vllm_config.model_config.tokenizer_mode, @@ -81,7 +94,7 @@ def __init__( # EngineCore (background process). self.engine_core_client = MPEngineCoreClient( input_path=to_engine_core_path, - output_path=from_engine_core_path, + output_path=to_detokenizer_path, vllm_config=vllm_config, executor_class=executor_class, usage_context=usage_context, @@ -183,7 +196,7 @@ def add_request( # Send to Detokenizer (which forwards to EngineCore). # Note: we forward the message rather than sending # to each process separately to avoid race conditions. 
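# [Editor's note] A minimal, single-process sketch of the "forward, don't fan
# out" rule in the note above: the front-end sends a new request only to the
# Detokenizer, which registers it locally and then relays the same bytes on to
# the EngineCore. Because the Detokenizer always sees the request before the
# EngineCore can produce output for it, outputs can never arrive for a request
# the Detokenizer does not know about. Socket paths and the dict used as
# request state are illustrative, not from the patch.
import pickle
import zmq

ctx = zmq.Context()

# Front-end (LLMEngine / AsyncLLM) -> Detokenizer.
detok_input = ctx.socket(zmq.PULL)
detok_input.bind("inproc://to-detokenizer")
to_detokenizer = ctx.socket(zmq.PUSH)
to_detokenizer.connect("inproc://to-detokenizer")

# Detokenizer -> EngineCore.
core_input = ctx.socket(zmq.PULL)
core_input.bind("inproc://to-engine-core")
detok_to_core = ctx.socket(zmq.PUSH)
detok_to_core.connect("inproc://to-engine-core")

# 1) Front-end sends the new request once, to the Detokenizer only.
request = {"request_id": "req-0", "prompt_token_ids": [1, 2, 3]}
to_detokenizer.send(pickle.dumps(request))

# 2) Detokenizer registers the request, then forwards the identical bytes.
request_states = {}
request_bytes = detok_input.recv()
req = pickle.loads(request_bytes)
request_states[req["request_id"]] = req
detok_to_core.send(request_bytes)

# 3) EngineCore receives the request strictly after the Detokenizer did.
print(pickle.loads(core_input.recv()))

ctx.destroy(linger=0)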
- self.detokenizer_client.input_socket.send_pyobj(engine_request) + self.send_to_detokenizer(engine_request) else: # Add directly to Detokenizer and EngineCore. self.detokenizer.add_request(engine_request) @@ -193,7 +206,7 @@ def step(self) -> List[RequestOutput]: if self.multiprocess_mode: # Get next output from the Detokenizer. - return self.detokenizer_client.output_socket.recv_pyobj() + return self.from_detokenizer.recv_pyobj() else: # Step EngineCore and Detokenizer. engine_core_outputs = self.engine_core.step() @@ -206,6 +219,12 @@ def step(self) -> List[RequestOutput]: return request_outputs + def send_to_detokenizer(self, object: Any): + """Send object to Detokenizer with a FROM_ENGINE flag.""" + + msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(object)) + self.to_detokenizer.send_multipart(msg, copy=False) + def get_model_config(self): return self.model_config From 80610784ce4840d5ad72a9b557c8a468b6a2fc17 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 23:16:32 +0000 Subject: [PATCH 121/132] mypy --- vllm/v1/engine/async_llm.py | 4 ++-- vllm/v1/engine/core.py | 4 ++-- vllm/v1/engine/detokenizer.py | 27 ++++++++++++++------------ vllm/v1/engine/llm_engine.py | 12 +++++++----- vllm/v1/executor/multiproc_executor.py | 4 ++-- vllm/v1/utils.py | 19 ++++++++++-------- 6 files changed, 39 insertions(+), 31 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 9a3a5530cb052..80ea0dc75234f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -98,9 +98,9 @@ def __init__( # Detokenizer IPC. self.ctx = zmq.asyncio.Context(io_threads=2) self.from_detokenizer = make_zmq_socket(self.ctx, to_llm_engine_path, - zmq.PULL) + zmq.constants.PULL) self.to_detokenizer = make_zmq_socket(self.ctx, to_detokenizer_path, - zmq.PUSH) + zmq.constants.PUSH) # Detokenizer (background process). self.detokenizer_client = MPDetokenizerClient( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 136801ffbd617..6ce4a6621080e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -267,7 +267,7 @@ def _handle_client_request(self, request: EngineRequestUnion) -> None: def process_input_socket(self, input_path: str): """Input socket IO thread.""" - with zmq_socket_ctx(input_path, zmq.PULL) as socket: + with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket: while True: # Push to input queue for core busy loop. request = socket.recv_pyobj() @@ -281,7 +281,7 @@ def process_output_socket(self, output_path: str): # Reuse send buffer. 
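# [Editor's note] The `buffer = bytearray()` allocated just below is reused
# for every outgoing batch, so the hot output path never allocates a fresh
# buffer per send. A standalone sketch of that pattern with msgspec's msgpack
# encoder follows; the struct fields and the assumption that the unchanged
# loop body serializes via Encoder.encode_into() are illustrative.
from typing import List

import msgspec


class EngineCoreOutput(msgspec.Struct, array_like=True, omit_defaults=True):
    request_id: str
    new_token_ids: List[int]


class EngineCoreOutputs(msgspec.Struct, array_like=True, omit_defaults=True):
    outputs: List[EngineCoreOutput]


encoder = msgspec.msgpack.Encoder()
decoder = msgspec.msgpack.Decoder(EngineCoreOutputs)
buffer = bytearray()  # allocated once, reused for every send

for step in range(3):
    outputs = EngineCoreOutputs(outputs=[
        EngineCoreOutput(request_id=f"req-{step}", new_token_ids=[1, 2, 3])
    ])
    # Serialize directly into the existing buffer (resized in place).
    encoder.encode_into(outputs, buffer)
    # A real implementation would now do socket.send(buffer, copy=False).
    assert decoder.decode(buffer).outputs[0].request_id == f"req-{step}"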
buffer = bytearray() - with zmq_socket_ctx(output_path, zmq.PUSH) as socket: + with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index b54cdd80db4ae..c3d3e7db6a338 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -15,8 +15,9 @@ AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import get_exception_traceback -from vllm.v1.engine import (EngineCoreOutputs, EngineRequestType, - EngineRequest, EngineAbortRequest) +from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, + EngineRequestType, EngineRequest, + EngineAbortRequest) from vllm.v1.utils import make_zmq_socket, MPBackgroundProcess logger = init_logger(__name__) @@ -250,13 +251,13 @@ def add_request( def step( self, - encore_core_outputs: EngineCoreOutputs, + encore_core_outputs: List[EngineCoreOutput], ) -> Tuple[List[RequestOutput], List[str]]: """Update state and make RequestOutputs for the LLMEngine.""" request_outputs: List[RequestOutput] = [] requests_to_abort: List[str] = [] - for engine_core_output in encore_core_outputs.outputs: + for engine_core_output in encore_core_outputs: request_id = engine_core_output.request_id detokenizer = self.request_states.get(request_id) if detokenizer is None: @@ -349,7 +350,7 @@ def signal_handler(signum, frame): def _handle_from_llm_engine( self, request_bytes: bytes, - to_engine_core: zmq.Socket, + to_engine_core: zmq.constants.Socket, ) -> None: """Handle EngineRequest from the LLMEngine.""" @@ -368,14 +369,14 @@ def _handle_from_llm_engine( def _handle_from_engine_core( self, output_bytes: bytes, - to_engine_core: zmq.Socket, - to_llm_engine: zmq.Socket, + to_engine_core: zmq.constants.Socket, + to_llm_engine: zmq.constants.Socket, decoder: msgspec.msgpack.Decoder, ) -> None: """Handle Outputs from the EngineCore.""" # Deserialize the EngineOutput (use msgpack for performance). - outputs: EngineCoreOutputs = decoder.decode(output_bytes) + outputs: List[EngineCoreOutput] = decoder.decode(output_bytes).outputs # Detokenize. request_outputs, requests_to_abort = self.step(outputs) @@ -392,12 +393,14 @@ def run_busy_loop(self): decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) - ctx = zmq.Context(io_threads=2) + ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] try: - input_socket = make_zmq_socket(ctx, self.input_path, zmq.PULL) - to_llm_engine = make_zmq_socket(ctx, self.output_path, zmq.PUSH) + input_socket = make_zmq_socket(ctx, self.input_path, + zmq.constants.PULL) + to_llm_engine = make_zmq_socket(ctx, self.output_path, + zmq.constants.PUSH) to_engine_core = make_zmq_socket(ctx, self.to_engine_core_path, - zmq.PUSH) + zmq.constants.PUSH) while True: (msg_type, msg_bytes) = input_socket.recv_multipart() diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index a7223ae3571ba..efddf3a049cf5 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -74,11 +74,13 @@ def __init__( to_llm_engine_path = get_open_zmq_ipc_path() # Detokenizer IPC. 
- self.ctx = zmq.Context(io_threads=2) - self.from_detokenizer = make_zmq_socket( - self.ctx, to_llm_engine_path, zmq.PULL) - self.to_detokenizer = make_zmq_socket( - self.ctx, to_detokenizer_path, zmq.PUSH) + self.ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] + self.from_detokenizer = make_zmq_socket(self.ctx, + to_llm_engine_path, + zmq.constants.PULL) + self.to_detokenizer = make_zmq_socket(self.ctx, + to_detokenizer_path, + zmq.constants.PUSH) # Detokenizer (background process). self.detokenizer_client = MPDetokenizerClient( diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 6164a12cda770..78509b9cc6a08 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -250,7 +250,7 @@ def __init__( worker_response_mq_handle = self.worker_response_mq.export_handle() # Send Readiness signal to EngineCore process. - with zmq_socket_ctx(ready_path, zmq.PUSH) as ready_socket: + with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: payload = pickle.dumps(worker_response_mq_handle, protocol=pickle.HIGHEST_PROTOCOL) ready_socket.send_string(WorkerProc.READY_STR) @@ -352,7 +352,7 @@ def wait_for_startup( ready_path: str, ) -> Optional[Handle]: """Wait until the Worker is ready.""" - with zmq_socket_ctx(ready_path, zmq.PULL) as socket: + with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: # Wait for Worker to send READY. while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 1544dd104c434..ce0bccfa40f66 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -84,8 +84,11 @@ def __len__(self): return len(self._x) -def make_zmq_socket(ctx: Union[zmq.asyncio.Context, zmq.Context], path: str, - type: Any) -> Union[zmq.Socket, zmq.asyncio.Socket]: +def make_zmq_socket( + ctx: Union[zmq.asyncio.Context, zmq.Context], # type: ignore[name-defined] + path: str, + type: Any, +) -> Union[zmq.Socket, zmq.asyncio.Socket]: # type: ignore[name-defined] """Make a ZMQ socket with the proper bind/connext semantics.""" import psutil @@ -100,13 +103,13 @@ def make_zmq_socket(ctx: Union[zmq.asyncio.Context, zmq.Context], path: str, else: buf_size = -1 - if type == zmq.PULL: - socket.setsockopt(zmq.RCVHWM, 0) - socket.setsockopt(zmq.RCVBUF, buf_size) + if type == zmq.constants.PULL: + socket.setsockopt(zmq.constants.RCVHWM, 0) + socket.setsockopt(zmq.constants.RCVBUF, buf_size) socket.bind(path) - elif type == zmq.PUSH: - socket.setsockopt(zmq.SNDHWM, 0) - socket.setsockopt(zmq.SNDBUF, buf_size) + elif type == zmq.constants.PUSH: + socket.setsockopt(zmq.constants.SNDHWM, 0) + socket.setsockopt(zmq.constants.SNDBUF, buf_size) socket.connect(path) else: raise ValueError(f"Unknown Socket Type: {type}") From 8372665b8cd1be253f0a753c62d1dcc074a3d10f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 23:25:44 +0000 Subject: [PATCH 122/132] stash --- vllm/entrypoints/openai/api_server.py | 2 +- vllm/utils.py | 2 +- vllm/v1/engine/async_llm.py | 16 +++++++++++----- vllm/v1/engine/core.py | 16 ++++++++++------ vllm/v1/engine/detokenizer.py | 27 ++++++++++++++------------- vllm/v1/engine/llm_engine.py | 8 ++++---- vllm/v1/utils.py | 12 ++++++------ 7 files changed, 47 insertions(+), 36 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 76d9a2bd714cd..4264ff22f8ab9 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -64,7 
+64,7 @@ from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, - is_valid_ipv6_address, set_ulimit, kill_process_tree) + is_valid_ipv6_address, kill_process_tree, set_ulimit) from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds diff --git a/vllm/utils.py b/vllm/utils.py index caed96d200bfc..5c7635f4d3e82 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -16,9 +16,9 @@ import subprocess import sys import tempfile -import traceback import threading import time +import traceback import uuid import warnings import weakref diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 80ea0dc75234f..dbeea5f145fc2 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -16,13 +16,13 @@ # Inspired by https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/managers/tokenizer_manager.py import asyncio -import zmq -import zmq.asyncio import pickle - from typing import (Any, AsyncGenerator, Dict, List, Mapping, Optional, Type, Union) +import zmq +import zmq.asyncio + from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.metrics_types import StatLoggerBase @@ -39,7 +39,7 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils import get_open_zmq_ipc_path -from vllm.v1.engine import EngineAbortRequest +from vllm.v1.engine import EngineAbortRequest, EngineRequestType from vllm.v1.engine.core import MPEngineCoreClient from vllm.v1.engine.detokenizer import MPDetokenizerClient from vllm.v1.engine.processor import Processor @@ -238,7 +238,7 @@ async def generate( The output_handler() loop runs in a background task, pulling from Detokenizer and pushing to the per request queue. - The generate() pulls from the per request queue and yeilds + The generate() pulls from the per request queue and yields to the caller which iterates the AsyncGenerator. 
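# [Editor's note] A self-contained asyncio sketch of the fan-out pattern the
# docstring above describes: a single background task pulls outputs (here from
# a stand-in for the Detokenizer) and routes them into per-request queues,
# while each generate() call only awaits its own queue and yields to the
# caller. The RequestOutput stand-in and the simulated output stream are
# illustrative, not taken from the patch.
import asyncio
from dataclasses import dataclass


@dataclass
class RequestOutput:
    request_id: str
    text: str
    finished: bool


async def fake_detokenizer_stream(out_q: asyncio.Queue) -> None:
    # Stand-in for the Detokenizer socket: emit three chunks per request.
    # Small sleep so generate() below registers its queue first (the real
    # engine guarantees this by registering before sending the request out).
    await asyncio.sleep(0.01)
    for i in range(3):
        for rid in ("req-0", "req-1"):
            await out_q.put(RequestOutput(rid, f"tok{i} ", finished=(i == 2)))


async def output_handler_loop(out_q: asyncio.Queue, rid_to_queue: dict) -> None:
    while True:
        out = await out_q.get()
        # Outputs for unknown / already-finished requests are simply dropped.
        if out.request_id in rid_to_queue:
            rid_to_queue[out.request_id].put_nowait(out)


async def generate(request_id: str, rid_to_queue: dict):
    q: asyncio.Queue = asyncio.Queue()
    rid_to_queue[request_id] = q
    try:
        while True:
            out = await q.get()
            yield out
            if out.finished:
                break
    finally:
        rid_to_queue.pop(request_id, None)


async def main() -> None:
    out_q: asyncio.Queue = asyncio.Queue()
    rid_to_queue: dict = {}
    handler = asyncio.create_task(output_handler_loop(out_q, rid_to_queue))
    producer = asyncio.create_task(fake_detokenizer_stream(out_q))

    async def consume(rid: str) -> str:
        return "".join([out.text async for out in generate(rid, rid_to_queue)])

    print(await asyncio.gather(consume("req-0"), consume("req-1")))
    producer.cancel()
    handler.cancel()


asyncio.run(main())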
""" @@ -313,6 +313,12 @@ async def abort(self, request_id: str): if self.log_requests: logger.info("Aborted %s.", request_id) + async def _send_to_detokenizer(self, obj: Any): + """Send object to Detokenizer with a FROM_ENGINE flag.""" + + msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(object)) + self.to_detokenizer.send_multipart(msg, copy=False) + def encode( self, prompt: PromptType, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 6ce4a6621080e..e4c587f1d7eb1 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1,14 +1,14 @@ -import psutil import queue import signal import threading import time +from multiprocessing.connection import Connection from typing import List, Tuple, Type +import psutil import zmq import zmq.asyncio from msgspec import msgpack -from multiprocessing.connection import Connection from vllm.config import CacheConfig, VllmConfig from vllm.logger import init_logger @@ -17,14 +17,14 @@ from vllm.usage.usage_lib import UsageContext from vllm.utils import get_exception_traceback from vllm.v1.core.scheduler import Scheduler -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineAbortRequest, EngineRequest, - EngineRequestType, EngineProfileRequest, +from vllm.v1.engine import (EngineAbortRequest, EngineCoreOutput, + EngineCoreOutputs, EngineProfileRequest, + EngineRequest, EngineRequestType, EngineRequestUnion) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus -from vllm.v1.utils import zmq_socket_ctx, MPBackgroundProcess +from vllm.v1.utils import MPBackgroundProcess, zmq_socket_ctx from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -310,3 +310,7 @@ def __init__(self, input_path: str, output_path: str, "usage_context": usage_context, }, ) + + async def profile_async(self, is_start: bool = True): + # TODO: enable this. 
+ pass diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index c3d3e7db6a338..94d0741402efe 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,11 +1,12 @@ -import psutil import pickle -import zmq.asyncio -import msgspec import signal from dataclasses import dataclass from multiprocessing.connection import Connection -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Dict, Iterable, List, Optional, Tuple, Union + +import msgspec +import psutil +import zmq from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger @@ -15,10 +16,10 @@ AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import get_exception_traceback -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineRequestType, EngineRequest, - EngineAbortRequest) -from vllm.v1.utils import make_zmq_socket, MPBackgroundProcess +from vllm.v1.engine import (EngineAbortRequest, EngineCoreOutput, + EngineCoreOutputs, EngineRequest, + EngineRequestType) +from vllm.v1.utils import MPBackgroundProcess, make_zmq_socket logger = init_logger(__name__) @@ -348,9 +349,9 @@ def signal_handler(signum, frame): detokenizer = None def _handle_from_llm_engine( - self, - request_bytes: bytes, - to_engine_core: zmq.constants.Socket, + self, + request_bytes: bytes, + to_engine_core: zmq.Socket, # type: ignore[name-defined] ) -> None: """Handle EngineRequest from the LLMEngine.""" @@ -369,8 +370,8 @@ def _handle_from_llm_engine( def _handle_from_engine_core( self, output_bytes: bytes, - to_engine_core: zmq.constants.Socket, - to_llm_engine: zmq.constants.Socket, + to_engine_core: zmq.Socket, # type: ignore[name-defined] + to_llm_engine: zmq.Socket, # type: ignore[name-defined] decoder: msgspec.msgpack.Decoder, ) -> None: """Handle Outputs from the EngineCore.""" diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index efddf3a049cf5..b80a986b90433 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,8 +1,8 @@ +import pickle from typing import Any, Dict, List, Mapping, Optional, Type, Union -from typing_extensions import TypeVar import zmq -import pickle +from typing_extensions import TypeVar from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs @@ -198,7 +198,7 @@ def add_request( # Send to Detokenizer (which forwards to EngineCore). # Note: we forward the message rather than sending # to each process separately to avoid race conditions. - self.send_to_detokenizer(engine_request) + self._send_to_detokenizer(engine_request) else: # Add directly to Detokenizer and EngineCore. 
self.detokenizer.add_request(engine_request) @@ -221,7 +221,7 @@ def step(self) -> List[RequestOutput]: return request_outputs - def send_to_detokenizer(self, object: Any): + def _send_to_detokenizer(self, object: Any): """Send object to Detokenizer with a FROM_ENGINE flag.""" msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(object)) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index ce0bccfa40f66..2f8208e09bbef 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,18 +1,18 @@ import os import weakref -from dataclasses import dataclass -from multiprocessing.process import BaseProcess from collections.abc import Sequence from contextlib import contextmanager -from typing import (Any, Generic, Dict, Iterator, List, Optional, TypeVar, - Union, Callable, overload) +from dataclasses import dataclass +from multiprocessing.process import BaseProcess +from typing import (Any, Callable, Dict, Generic, Iterator, List, Optional, + TypeVar, Union, overload) import zmq import zmq.asyncio +from vllm.executor.multiproc_worker_utils import get_mp_context from vllm.logger import init_logger from vllm.utils import kill_process_tree -from vllm.executor.multiproc_worker_utils import get_mp_context logger = init_logger(__name__) @@ -195,7 +195,7 @@ def wait_for_startup( # Wait for startup. if reader.recv()["status"] != "READY": - raise RuntimeError(f"{process_name} initalization failed. " + raise RuntimeError(f"{process_name} initialization failed. " "See root cause above.") return BackgroundProcHandle(proc, input_path, output_path) From 6b4f2bbe2ba2b9e79b8f5bd940cc73ac80b3bf2c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 23:39:10 +0000 Subject: [PATCH 123/132] almost there with llm engine --- vllm/v1/engine/llm_engine.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index b80a986b90433..25ac92cdb4ce8 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,5 +1,5 @@ import pickle -from typing import Any, Dict, List, Mapping, Optional, Type, Union +from typing import Any, Dict, List, Mapping, Optional, Set, Type, Union import zmq from typing_extensions import TypeVar @@ -68,6 +68,9 @@ def __init__( mm_registry=mm_registry) if self.multiprocess_mode: + # Keep track of active requests. + self.running_requests: Set[str] = set() + # IPC paths. to_detokenizer_path = get_open_zmq_ipc_path() to_engine_core_path = get_open_zmq_ipc_path() @@ -160,10 +163,13 @@ def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: return executor_class def get_num_unfinished_requests(self) -> int: - return self.detokenizer.get_num_unfinished_requests() + if self.multiprocess_mode: + return len(self.running_requests) + else: + return self.detokenizer.get_num_unfinished_requests() def has_unfinished_requests(self) -> bool: - return self.detokenizer.has_unfinished_requests() + return self.get_num_unfinished_requests() > 0 @classmethod def validate_outputs(cls, outputs, output_type): @@ -193,8 +199,10 @@ def add_request( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # Add to Detokenizer and EngineCore. + # Add processed input to system. if self.multiprocess_mode: + assert engine_request.request_id not in self.running_requests + self.running_requests.add(engine_request.request_id) # Send to Detokenizer (which forwards to EngineCore). 
# Note: we forward the message rather than sending # to each process separately to avoid race conditions. @@ -208,7 +216,14 @@ def step(self) -> List[RequestOutput]: if self.multiprocess_mode: # Get next output from the Detokenizer. - return self.from_detokenizer.recv_pyobj() + request_outputs: List[ + RequestOutput] = self.from_detokenizer.recv_pyobj() + + # Removed finsihed requests from the state tracker. + for out in request_outputs: + if out.finished: + self.running_requests.remove(out.request_id) + else: # Step EngineCore and Detokenizer. engine_core_outputs = self.engine_core.step() @@ -219,7 +234,7 @@ def step(self) -> List[RequestOutput]: if requests_to_abort: self.abort_request(requests_to_abort) - return request_outputs + return request_outputs def _send_to_detokenizer(self, object: Any): """Send object to Detokenizer with a FROM_ENGINE flag.""" From db7d055b15d35cc21b1f6dae8b120b2655faab14 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 23 Dec 2024 23:41:28 +0000 Subject: [PATCH 124/132] format' --- vllm/v1/engine/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 25ac92cdb4ce8..c26d6543d6728 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -219,7 +219,7 @@ def step(self) -> List[RequestOutput]: request_outputs: List[ RequestOutput] = self.from_detokenizer.recv_pyobj() - # Removed finsihed requests from the state tracker. + # Removed finished requests from the state tracker. for out in request_outputs: if out.finished: self.running_requests.remove(out.request_id) From 98053d6eb3ce349da7b9bbd52ef4e626f4bbeed6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 24 Dec 2024 14:28:43 +0000 Subject: [PATCH 125/132] clean --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index dbeea5f145fc2..a1b44b19d274c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -232,7 +232,7 @@ async def generate( """ Main function called by the API server to kick off a request * 1) Make an output queue for the Request. - # 2) Processing the Input (e.g. Tokenizer). + * 2) Processing the Input (e.g. Tokenizer, MM). * 3) Adding the Request to Detokenizer + EngineCore. The output_handler() loop runs in a background task, pulling From 4713e29c2787409b7ca9472c6d2a3b9a2a674b9e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 24 Dec 2024 14:29:08 +0000 Subject: [PATCH 126/132] updated --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a1b44b19d274c..b2f5cc66b3f36 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -199,7 +199,7 @@ async def add_request( ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" - # 1) Convert Input --> EngineRequest. + # 1) Convert Input --> EngineRequest (Tokenize, MM, etc). 
engine_request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) From 4f946ebc749f0bf85f8d0a0149646c117d2249f1 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 24 Dec 2024 14:30:15 +0000 Subject: [PATCH 127/132] nit --- vllm/v1/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 2f8208e09bbef..5dfa7470c3a47 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -89,7 +89,7 @@ def make_zmq_socket( path: str, type: Any, ) -> Union[zmq.Socket, zmq.asyncio.Socket]: # type: ignore[name-defined] - """Make a ZMQ socket with the proper bind/connext semantics.""" + """Make a ZMQ socket with the proper bind/connect semantics.""" import psutil mem = psutil.virtual_memory() From 59c64300d792cd3b42641c0172a9a03073beccd1 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Tue, 24 Dec 2024 09:31:22 -0500 Subject: [PATCH 128/132] Update vllm/v1/utils.py Co-authored-by: Michael Goin --- vllm/v1/utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 5dfa7470c3a47..49b0cf19fd851 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -96,12 +96,17 @@ def make_zmq_socket( socket = ctx.socket(type) + # Calculate buffer size based on system memory total_mem = mem.total / 1024**3 available_mem = mem.available / 1024**3 + # For systems with substantial memory (>32GB total, >16GB available): + # - Set a large 0.5GB buffer to improve throughput + # For systems with less memory: + # - Use system default (-1) to avoid excessive memory consumption if total_mem > 32 and available_mem > 16: - buf_size = int(0.5 * 1024**3) + buf_size = int(0.5 * 1024**3) # 0.5GB in bytes else: - buf_size = -1 + buf_size = -1 # Use system default buffer size if type == zmq.constants.PULL: socket.setsockopt(zmq.constants.RCVHWM, 0) From 856838d1838575a149df042b6d544da3b2715e0a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 24 Dec 2024 14:37:09 +0000 Subject: [PATCH 129/132] updated --- vllm/v1/engine/core.py | 3 +++ vllm/v1/engine/detokenizer.py | 3 +++ vllm/v1/engine/llm_engine.py | 5 ++++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e4c587f1d7eb1..ace4bb8bfed1d 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -210,7 +210,10 @@ def run_busy_loop(self): """Core busy loop of the EngineCore.""" # Loop until process is sent a SIGINT or SIGTERM + i = 0 while True: + print(f"EPOCH: {i}") + i += 1 # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): while True: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 94d0741402efe..70ef3d018c2c6 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -402,7 +402,10 @@ def run_busy_loop(self): zmq.constants.PUSH) to_engine_core = make_zmq_socket(ctx, self.to_engine_core_path, zmq.constants.PUSH) + i = 0 while True: + print(f"EPOCH: {i}") + i += 1 (msg_type, msg_bytes) = input_socket.recv_multipart() # Handle message from LLMEngine (Abort or New Request). 
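# [Editor's note] A standalone sketch consolidating the bind/connect and
# buffer-sizing policy that the make_zmq_socket() changes above settle on:
# PULL sockets bind and PUSH sockets connect, the high-water mark is disabled,
# and the 0.5 GiB OS buffer is only requested on hosts with plenty of memory
# (> 32 GiB total and > 16 GiB available). This is a simplified re-statement
# for illustration, not the patch's exact function; the ipc path is made up.
import psutil
import zmq


def make_example_socket(ctx: zmq.Context, path: str, sock_type: int) -> zmq.Socket:
    mem = psutil.virtual_memory()
    total_gib = mem.total / 1024**3
    available_gib = mem.available / 1024**3
    # Large buffers help throughput, but only ask for them when the host
    # clearly has memory to spare; otherwise keep the OS default (-1).
    buf_size = int(0.5 * 1024**3) if (total_gib > 32 and available_gib > 16) else -1

    socket = ctx.socket(sock_type)
    if sock_type == zmq.PULL:
        socket.setsockopt(zmq.RCVHWM, 0)  # no message-count limit
        socket.setsockopt(zmq.RCVBUF, buf_size)
        socket.bind(path)
    elif sock_type == zmq.PUSH:
        socket.setsockopt(zmq.SNDHWM, 0)
        socket.setsockopt(zmq.SNDBUF, buf_size)
        socket.connect(path)
    else:
        raise ValueError(f"Unknown socket type: {sock_type}")
    return socket


if __name__ == "__main__":
    ctx = zmq.Context()
    pull = make_example_socket(ctx, "ipc:///tmp/example_sock", zmq.PULL)
    push = make_example_socket(ctx, "ipc:///tmp/example_sock", zmq.PUSH)
    push.send(b"ping")
    print(pull.recv())
    ctx.destroy(linger=0)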
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 3e707af7bc10e..2b4a0ec855124 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -25,8 +25,8 @@ from vllm.v1.engine.detokenizer import Detokenizer, MPDetokenizerClient from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.utils import make_zmq_socket from vllm.v1.executor.ray_utils import initialize_ray_cluster +from vllm.v1.utils import make_zmq_socket logger = init_logger(__name__) @@ -199,6 +199,8 @@ def add_request( priority: int = 0, ) -> None: + logger.info("Added request.") + # Process raw inputs into the request. engine_request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, @@ -219,6 +221,7 @@ def add_request( def step(self) -> List[RequestOutput]: + logger.info("Called step.") if self.multiprocess_mode: # Get next output from the Detokenizer. request_outputs: List[ From 94fe4afbb397cd621dbd5525f8756ef3fe478528 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 24 Dec 2024 14:50:41 +0000 Subject: [PATCH 130/132] updated --- vllm/v1/engine/async_llm.py | 6 +++--- vllm/v1/engine/core.py | 3 --- vllm/v1/engine/detokenizer.py | 4 +--- vllm/v1/engine/llm_engine.py | 2 -- 4 files changed, 4 insertions(+), 11 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b2f5cc66b3f36..cb87985449b9e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -210,7 +210,7 @@ async def add_request( # 3) Send to Detokenizer (which forwards to EngineCore). # Note: we forward the request rather than sending to each # process separately to avoid race conditions in Detokenizer). - await self.send_to_detokenizer(engine_request) + await self._send_to_detokenizer(engine_request) return self.rid_to_queue[request_id] @@ -304,7 +304,7 @@ async def abort(self, request_id: str): """Abort request if the client cancels the request.""" # Send abort to Detokenizer (which will fwd to EngineCore). - await self.send_to_detokenizer(EngineAbortRequest([request_id])) + await self._send_to_detokenizer(EngineAbortRequest([request_id])) # Remove from request output queues. if request_id in self.rid_to_queue: @@ -317,7 +317,7 @@ async def _send_to_detokenizer(self, obj: Any): """Send object to Detokenizer with a FROM_ENGINE flag.""" msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(object)) - self.to_detokenizer.send_multipart(msg, copy=False) + await self.to_detokenizer.send_multipart(msg, copy=False) def encode( self, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index ace4bb8bfed1d..e4c587f1d7eb1 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -210,10 +210,7 @@ def run_busy_loop(self): """Core busy loop of the EngineCore.""" # Loop until process is sent a SIGINT or SIGTERM - i = 0 while True: - print(f"EPOCH: {i}") - i += 1 # 1) Poll the input queue until there is work to do. 
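# [Editor's note] A standalone sketch of the two-phase input polling in
# step 1 just below: while the scheduler has nothing to run, the busy loop
# blocks on the input queue (with a timeout so it can still log); once there
# is work, it only drains whatever is already queued without blocking before
# stepping. The scheduler / request stand-ins are illustrative, not from the
# patch.
import queue

POLLING_TIMEOUT_S = 2.5


def busy_loop(input_queue: "queue.Queue[str]", num_steps: int) -> None:
    pending: list = []

    for _ in range(num_steps):
        # Phase 1: nothing scheduled -> block until a request shows up.
        if not pending:
            while True:
                try:
                    pending.append(input_queue.get(timeout=POLLING_TIMEOUT_S))
                    break
                except queue.Empty:
                    print("busy loop waiting for work ...")

        # Phase 2: already busy -> drain new inputs without blocking.
        while not input_queue.empty():
            pending.append(input_queue.get_nowait())

        # Stand-in for scheduler.schedule() + execute_model() + outputs.
        print(f"step over {len(pending)} request(s)")
        pending.clear()


if __name__ == "__main__":
    q: "queue.Queue[str]" = queue.Queue()
    for rid in ("req-0", "req-1", "req-2"):
        q.put(rid)
    busy_loop(q, num_steps=1)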
if not self.scheduler.has_unfinished_requests(): while True: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 70ef3d018c2c6..2d8724e687448 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -402,10 +402,8 @@ def run_busy_loop(self): zmq.constants.PUSH) to_engine_core = make_zmq_socket(ctx, self.to_engine_core_path, zmq.constants.PUSH) - i = 0 + while True: - print(f"EPOCH: {i}") - i += 1 (msg_type, msg_bytes) = input_socket.recv_multipart() # Handle message from LLMEngine (Abort or New Request). diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 2b4a0ec855124..149373023c8f2 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -220,8 +220,6 @@ def add_request( self.engine_core.add_request(engine_request) def step(self) -> List[RequestOutput]: - - logger.info("Called step.") if self.multiprocess_mode: # Get next output from the Detokenizer. request_outputs: List[ From 127045a682024d0ca6ab09a70a8f00987fdcd8b9 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 24 Dec 2024 14:59:57 +0000 Subject: [PATCH 131/132] stash --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index cb87985449b9e..d82f278e36744 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -316,7 +316,7 @@ async def abort(self, request_id: str): async def _send_to_detokenizer(self, obj: Any): """Send object to Detokenizer with a FROM_ENGINE flag.""" - msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(object)) + msg = (EngineRequestType.FROM_ENGINE.value, pickle.dumps(obj)) await self.to_detokenizer.send_multipart(msg, copy=False) def encode( From 1352386fad6ac73bd62611e86cc8c5d9411968bf Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 24 Dec 2024 15:03:38 +0000 Subject: [PATCH 132/132] remove log --- vllm/v1/engine/llm_engine.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 149373023c8f2..536fdb28717b4 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -199,8 +199,6 @@ def add_request( priority: int = 0, ) -> None: - logger.info("Added request.") - # Process raw inputs into the request. engine_request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request,