vllm-project · njhill · Feb 24, 2025 · Feb 9, 2025 · Feb 9, 2025 · Feb 9, 2025
diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py
@@ -250,6 +250,65 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
     assert "".join(chunks) == single_output
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_parallel_no_streaming(client: openai.AsyncOpenAI,
+                                     model_name: str):
+    """Parallel sampling without streaming.
+
+    A single, non-streamed response must contain all `n` completions,
+    each reported under its own index and with a finish reason.
+    """
+
+    prompt = "What is an LLM?"
+    n = 3
+    max_tokens = 5
+
+    completion = await client.completions.create(model=model_name,
+                                                 prompt=prompt,
+                                                 max_tokens=max_tokens,
+                                                 n=n,
+                                                 stream=False)
+
+    # Check the count first: the per-choice loop below would pass
+    # vacuously if the server returned fewer than `n` (or zero) choices.
+    num_completions = len(completion.choices)
+    assert num_completions == n, (
+        f"Num completions {num_completions} but expected {n}.")
+
+    # Every sample index must be reported exactly once.
+    assert sorted(choice.index
+                  for choice in completion.choices) == list(range(n))
+
+    for choice in completion.choices:
+        assert choice.finish_reason is not None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
+    """Streaming for parallel sampling.
+
+    The tokens from multiple samples are flattened into a single stream,
+    with an index to indicate which sample each token belongs to. The test
+    regroups the streamed text by that index and expects `max_tokens`
+    chunks and one finish reason per sample.
+    """
+
+    prompt = "What is an LLM?"
+    n = 3
+    max_tokens = 5
+
+    stream = await client.completions.create(model=model_name,
+                                             prompt=prompt,
+                                             max_tokens=max_tokens,
+                                             n=n,
+                                             stream=True)
+
+    # One bucket of text pieces per sample, addressed by choice index.
+    samples: List[List[str]] = [[] for _ in range(n)]
+    num_finished = 0
+    async for event in stream:
+        choice = event.choices[0]
+        samples[choice.index].append(choice.text)
+        if choice.finish_reason is not None:
+            num_finished += 1
+
+    # Each of the n samples must have reported a finish reason once.
+    assert num_finished == n
+    for pieces in samples:
+        assert len(pieces) == max_tokens
+        print("".join(pieces))
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",

@@ -24,6 +24,8 @@
 from vllm.utils import cdiv, kill_process_tree
 from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.output_processor import OutputProcessor
+from vllm.v1.engine.parallel_sampling import (ParallelSamplingOutputProcessor,
+                                              ParentRequestState)
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger,
@@ -50,6 +52,8 @@ def __init__(
         assert start_engine_loop
 
         self.model_config = vllm_config.model_config
+        self.enable_prefix_caching = (
+            vllm_config.cache_config.enable_prefix_caching)
 
         self.log_requests = log_requests
         self.log_stats = log_stats
@@ -167,7 +171,7 @@ async def add_request(
     # requests we don't need to send multiple messages to core proc,
     # and so we don't need multiple streams which then get
     # re-multiplexed in the API server anyhow.
-    async def generate(
+    async def _generate(
         self,
         prompt: PromptType,
         sampling_params: SamplingParams,
@@ -238,6 +242,121 @@ async def generate(
             await self.abort(request_id)
             raise
 
+    async def _parallel_sampling_task(
+        self,
+        gen: AsyncGenerator[RequestOutput, None],
+        output_processor: ParallelSamplingOutputProcessor,
+        index: int,
+    ) -> AsyncGenerator[RequestOutput, None]:
+        """Filter one child request's outputs through the output processor.
+
+        Each output produced by `gen` is handed to
+        `output_processor.process_output` together with the child's
+        `index`; only truthy results are passed on to the caller.
+        """
+        async for child_out in gen:
+            processed = output_processor.process_output(child_out, index)
+            if not processed:
+                # Nothing to emit yet for this child output.
+                continue
+            yield processed
+
+    async def _parallel_sampling_batch(
+        self,
+        prompt: PromptType,
+        sampling_params: SamplingParams,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+        priority: int = 0,
+    ) -> AsyncGenerator[RequestOutput, None]:
+        """Serve one request with `n` samples as `n` single-sample requests.
+
+        Steps:
+          1. If prefix caching is enabled, run a warmup request
+             (`max_tokens=1`, `n=1`, final output only) to completion
+             and discard its output.
+          2. Start one child `_generate` call per sample, each with
+             `n=1`. When a seed is set, child `i` uses `seed + i`.
+          3. Yield child outputs as they become available, after they
+             have been passed through `ParallelSamplingOutputProcessor`.
+
+        On exit (normal, error or abandonment by the consumer), every
+        still-pending child task is cancelled.
+        """
+        parent_state = ParentRequestState(request_id, sampling_params)
+        output_processor = ParallelSamplingOutputProcessor(parent_state)
+        n = parent_state.n
+
+        if self.enable_prefix_caching:
+            # If engine uses APC, generate a "warmup request" with
+            # max_tokens=1 which populates the APC
+            w_sampling_params = parent_state.get_child_sampling_params({
+                "max_tokens": 1,
+                "n": 1,
+                "output_kind": RequestOutputKind.FINAL_ONLY,
+            })
+            async for _ in self._generate(
+                    prompt,
+                    w_sampling_params,
+                    parent_state.get_warmup_request_id(),
+                    lora_request,
+                    trace_headers,
+                    prompt_adapter_request,
+                    priority,
+            ):
+                # Exhaust the generator
+                pass
+
+        # Aggregate generators for n child requests.
+        # `active` maps each in-flight `__anext__()` task to its child index.
+        gens = []
+        active = {}
+        seed = sampling_params.seed
+        for idx in range(n):
+            c_sampling_params = parent_state.get_child_sampling_params({
+                "n": 1,
+                "seed": seed,
+            })
+            if seed is not None:
+                # Give every child a distinct, reproducible seed.
+                seed += 1
+            child_gen = self._generate(
+                prompt,
+                c_sampling_params,
+                parent_state.get_child_request_id(idx),
+                lora_request,
+                trace_headers,
+                prompt_adapter_request,
+                priority,
+            )
+            gen = self._parallel_sampling_task(child_gen, output_processor,
+                                               idx)
+            gens.append(gen)
+            active[asyncio.create_task(gen.__anext__())] = idx
+
+        try:
+            while active:
+                done, _ = await asyncio.wait(
+                    active.keys(), return_when=asyncio.FIRST_COMPLETED)
+                for task in done:
+                    idx = active.pop(task)
+                    try:
+                        result = task.result()
+                    except StopAsyncIteration:
+                        # This child is exhausted; do not reschedule it.
+                        continue
+                    # Schedule the next result *before* yielding. If the
+                    # consumer abandons this generator while it is
+                    # suspended at the yield, the child must still own a
+                    # task in `active`, otherwise the cleanup below would
+                    # never reach it and it would be left running.
+                    active[asyncio.create_task(
+                        gens[idx].__anext__())] = idx
+                    yield result
+        finally:
+            # Cancel whatever is still in flight.
+            for task in active:
+                task.cancel()
+
+    async def generate(
+        self,
+        prompt: PromptType,
+        sampling_params: SamplingParams,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+        priority: int = 0,
+    ) -> AsyncGenerator[RequestOutput, None]:
+        """Yield outputs for a request, dispatching on `sampling_params.n`.
+
+        When `n` is `None` or 1 the request is passed straight to
+        `_generate`; any other value is routed through
+        `_parallel_sampling_batch`. All arguments are forwarded unchanged.
+        """
+        num_samples = sampling_params.n
+        single_sample = num_samples is None or num_samples == 1
+        produce = (self._generate
+                   if single_sample else self._parallel_sampling_batch)
+
+        async for out in produce(prompt, sampling_params, request_id,
+                                 lora_request, trace_headers,
+                                 prompt_adapter_request, priority):
+            yield out
+
     async def _run_output_handler(self):
         """Background loop: pulls from EngineCore and pushes to AsyncStreams."""
 

@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from copy import copy
+from typing import Any, Dict, Optional
+
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import RequestOutputKind, SamplingParams
+
+
+class ParentRequestState:
+    """State of a parent request that is split into child requests.
+
+    Holds the parent's request ID and sampling parameters, derives the
+    IDs and sampling parameters for child requests, and accumulates
+    child completions into a single `RequestOutput`.
+    """
+    request_id: str
+    sampling_params: SamplingParams
+    # Aggregated output; stays None until the first child output is added.
+    request_output: Optional[RequestOutput] = None
+
+    def __init__(self, request_id: str,
+                 sampling_params: SamplingParams) -> None:
+        self.request_id = request_id
+        self.sampling_params = sampling_params
+
+    def get_child_sampling_params(
+        self,
+        kwargs: Optional[Dict[str, Any]] = None,
+    ) -> SamplingParams:
+        """Return a shallow copy of the parent's sampling params.
+
+        Each key/value pair in `kwargs`, if given, is set as an
+        attribute on the copy; the parent's params object is not
+        reassigned.
+        """
+        child_params = copy(self.sampling_params)
+        for name, value in (kwargs or {}).items():
+            setattr(child_params, name, value)
+        return child_params
+
+    def add_output(
+        self,
+        child_req_output: RequestOutput,
+    ) -> None:
+        """Fold a child request output into the aggregated output."""
+        aggregated = self.request_output
+        if aggregated is not None:
+            # Later outputs: move the child's completion into the
+            # aggregate, indexed by its arrival position.
+            completion = child_req_output.outputs[0]
+            completion.index = len(aggregated.outputs)
+            aggregated.outputs.append(completion)
+            return
+
+        # First output becomes the aggregate: reinstate the original
+        # request ID; metrics are not supported for parallel sampling.
+        child_req_output.request_id = self.request_id
+        child_req_output.metrics = None
+        self.request_output = child_req_output
+
+    def get_warmup_request_id(self) -> str:
+        """Return the request ID used for the warmup request."""
+        return f"w_{self.request_id}"
+
+    def get_child_request_id(
+        self,
+        index: int,
+    ) -> str:
+        """Return the request ID used for the child at `index`."""
+        return f"{index}_{self.request_id}"
+
+    @property
+    def num_completions(self) -> int:
+        """Number of completions accumulated so far.
+
+        Requires that at least one output has already been added.
+        """
+        aggregated = self.request_output
+        assert aggregated is not None
+        return len(aggregated.outputs)
+
+    @property
+    def n(self) -> int:
+        """The parent's `n` sampling parameter."""
+        return self.sampling_params.n
+
+    @property
+    def logprobs(self) -> Optional[int]:
+        """The parent's `logprobs` sampling parameter."""
+        return self.sampling_params.logprobs
+
+    @property
+    def prompt_logprobs(self) -> Optional[int]:
+        """The parent's `prompt_logprobs` sampling parameter."""
+        return self.sampling_params.prompt_logprobs
+
+    @property
+    def output_kind(self) -> RequestOutputKind:
+        """The parent's `output_kind` sampling parameter."""
+        return self.sampling_params.output_kind
+
+
+class ParallelSamplingOutputProcessor:
+    """Maps child request outputs onto the parent request's outputs."""
+
+    def __init__(
+        self,
+        parent_state: ParentRequestState,
+    ) -> None:
+        self.parent_state = parent_state
+
+    def process_output(
+        self,
+        child_req_output: RequestOutput,
+        index: int,
+    ) -> Optional[RequestOutput]:
+        """Process one output of the child request at `index`.
+
+        If the parent's output kind is not `FINAL_ONLY`, the child
+        output is relabelled with the parent's request ID and the
+        child's index, then returned immediately. Otherwise it is
+        accumulated, and the aggregated output is returned only once
+        the number of completions equals the parent's `n`; until then
+        the result is `None`.
+        """
+        parent = self.parent_state
+
+        if parent.output_kind != RequestOutputKind.FINAL_ONLY:
+            # stream=true: return child completions immediately
+            child_req_output.request_id = parent.request_id
+            child_req_output.outputs[0].index = index
+            return child_req_output
+
+        # stream=false: accumulate child completions
+        parent.add_output(child_req_output)
+        if parent.num_completions == parent.n:
+            # All completions obtained: hand back the accumulated output.
+            return parent.request_output
+        return None