From eb2428e85c0ba223f28c1792d79a0d1be300df1f Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Sat, 13 Apr 2024 01:56:57 +0900
Subject: [PATCH 001/106] [Test] Test multiple attn backend for chunked
 prefill.  (#4023)

---
 .buildkite/test-pipeline.yaml                  |  8 +++++++-
 .../test_basic_correctness.py                  |  6 ------
 .../basic_correctness/test_chunked_prefill.py  |  4 ----
 vllm/attention/backends/rocm_flash_attn.py     | 18 ++++++------------
 4 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 695290ed74ab5..8d7d6304cf12e 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -12,7 +12,13 @@ steps:
   command: pytest -v -s async_engine
 
 - label: Basic Correctness Test
-  command: pytest -v -s basic_correctness
+  commands:
+  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
+  - VLLM_ATTENTION_BACKEND=ROCM_FLASH pytest -v -s basic_correctness/test_basic_correctness.py
+  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=ROCM_FLASH pytest -v -s basic_correctness/test_chunked_prefill.py
 
 - label: Core Test
   command: pytest -v -s core
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index bd4c7ea3301be..97cff623c5e1d 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -4,8 +4,6 @@
 """
 import pytest
 
-from vllm.attention.selector import VLLM_ATTENTION_BACKEND
-
 MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
@@ -16,7 +14,6 @@
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False, True])
-@pytest.mark.parametrize("attn_backend", ["XFORMERS", "FLASH_ATTN"])
 def test_models(
     hf_runner,
     vllm_runner,
@@ -25,10 +22,7 @@ def test_models(
     dtype: str,
     max_tokens: int,
     enforce_eager: bool,
-    attn_backend: str,
-    monkeypatch,
 ) -> None:
-    monkeypatch.setenv(VLLM_ATTENTION_BACKEND, attn_backend)
     hf_model = hf_runner(model, dtype=dtype)
     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
     del hf_model
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index 293f5539015c0..b021377f229e5 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -34,10 +34,6 @@ def test_models(
     enforce_eager: bool,
     tensor_parallel_size: int,
 ) -> None:
-    if (tensor_parallel_size == 2 and chunked_prefill_token_size != 16
-            and not enforce_eager):
-        pytest.skip(f"Skip {chunked_prefill_token_size=} and {enforce_eager=} "
-                    "for high TP to save testing time.")
     max_num_seqs = min(chunked_prefill_token_size, 256)
     enable_chunked_prefill = False
     max_num_batched_tokens = None
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index e55435cd2c947..c42660fb8f74f 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -162,7 +162,7 @@ def __init__(
             # AMD Radeon 7900 series (gfx1100) currently does not support
             # xFormers nor FlashAttention. As a temporary workaround, we use
             # naive PyTorch implementation of attention.
-            self.attn_fuc = _naive_attention()
+            self.attn_fuc = _naive_attention
             logger.debug("Using naive attention in ROCmBackend")
         elif self.use_triton_flash_attn:
             from vllm.attention.ops.triton_flash_attention import (  # noqa: F401
@@ -334,26 +334,21 @@ def _naive_attention(
     prompt_lens: List[int],
     scale: float,
 ) -> torch.Tensor:
-    num_tokens = query.shape[0]
     output = torch.empty_like(query)
     start = 0
     for _, prompt_len in enumerate(prompt_lens):
         end = start + prompt_len
         out = _naive_masked_attention(
-            query[None, start:end],
-            key[None, start:end],
-            value[None, start:end],
+            query[start:end],
+            key[start:end],
+            value[start:end],
             scale,
         )
         # TODO(woosuk): Unnecessary copy. Optimize.
         output[start:end].copy_(out)
         start += prompt_len
 
-    # Using view got RuntimeError: view size is not compatible
-    # with input tensor's size and stride (at least one
-    # dimension spans across two contiguous subspaces).
-    # Use reshape instead.
-    return output.reshape(num_tokens, -1)
+    return output
 
 
 def _naive_masked_attention(
@@ -362,14 +357,13 @@ def _naive_masked_attention(
     value: torch.Tensor,
     scale: float,
 ) -> torch.Tensor:
-    seq_len, _, _ = query.shape
+    seq_len, head_size, head_dim = query.shape
     attn_mask = torch.triu(torch.ones(seq_len,
                                       seq_len,
                                       dtype=query.dtype,
                                       device=query.device),
                            diagonal=1)
     attn_mask = attn_mask * torch.finfo(query.dtype).min
-
     attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
     attn_weights = attn_weights + attn_mask.float()
     attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype)

From 71760ce22d7218d5e1a278a5c36344cfc4161aa8 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 12 Apr 2024 12:35:44 -0700
Subject: [PATCH 002/106] [Bugfix] fix type hint for py 3.8 (#4036)

---
 vllm/executor/executor_base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index c18edd75d7a4d..55bccfa8e3ca9 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 
 from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
@@ -31,7 +31,7 @@ def __init__(
         raise NotImplementedError
 
     @abstractmethod
-    def determine_num_available_blocks(self) -> tuple[int, int]:
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of available blocks for the GPU KV cache and
         swappable CPU KV cache.
 

From 405a695a6825d5262cbd482b187066883322827b Mon Sep 17 00:00:00 2001
From: Zhuohan Li <zhuohan123@gmail.com>
Date: Fri, 12 Apr 2024 13:56:04 -0700
Subject: [PATCH 003/106] [Misc] Fix typo in scheduler.py (#4022)

---
 vllm/core/scheduler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 411ef494fd93d..4aa17adb602d2 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -674,7 +674,7 @@ def _schedule_prefills(
     def _schedule_default(self) -> SchedulerOutputs:
         """Schedule queued requests.
         
-        The current policy is designed to opimimize the throughput. First,
+        The current policy is designed to optimize the throughput. First,
         it batches as many prefill requests as possible. And it schedules
         decodes. If there's a pressure on GPU memory, decode requests can
         be swapped or preempted.

From 801ad22ea42a88df3ac3105c8ee569c7f6c38e6c Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Sat, 13 Apr 2024 06:35:50 +0900
Subject: [PATCH 004/106] [mypy] Add mypy type annotation part 1 (#4006)

---
 .github/workflows/mypy.yaml            | 50 ++++++++++++++++++++++++++
 format.sh                              | 22 +++++++++---
 pyproject.toml                         |  5 ++-
 requirements-common.txt                |  3 +-
 requirements-dev.txt                   |  2 +-
 vllm/config.py                         |  9 +++--
 vllm/core/block_manager_v1.py          | 12 ++++---
 vllm/core/block_manager_v2.py          |  4 ++-
 vllm/core/interfaces.py                |  4 ++-
 vllm/core/scheduler.py                 | 25 +++++++------
 vllm/distributed/communication_op.py   | 10 +++---
 vllm/engine/ray_utils.py               | 18 ++++++----
 vllm/entrypoints/api_server.py         |  1 +
 vllm/entrypoints/llm.py                |  8 +++--
 vllm/executor/cpu_executor.py          |  4 +--
 vllm/executor/gpu_executor.py          |  4 +--
 vllm/executor/neuron_executor.py       |  4 +--
 vllm/executor/ray_gpu_executor.py      | 11 +++---
 vllm/sampling_params.py                |  5 +--
 vllm/sequence.py                       |  8 ++---
 vllm/transformers_utils/config.py      |  3 +-
 vllm/transformers_utils/detokenizer.py |  7 ++--
 vllm/transformers_utils/tokenizer.py   |  4 +--
 vllm/usage/usage_lib.py                |  8 ++---
 vllm/utils.py                          | 12 ++++---
 25 files changed, 171 insertions(+), 72 deletions(-)
 create mode 100644 .github/workflows/mypy.yaml

diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
new file mode 100644
index 0000000000000..fbe0f816fd4af
--- /dev/null
+++ b/.github/workflows/mypy.yaml
@@ -0,0 +1,50 @@
+name: mypy
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install mypy==1.9.0
+        pip install types-setuptools
+        pip install types-PyYAML
+        pip install types-requests
+        pip install types-setuptools
+    - name: Mypy
+      run: |
+        mypy vllm/attention/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/distributed/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/entrypoints/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/executor/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/usage/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml
+
+        # TODO(sang): Follow up
+        # mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
+        # mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
+        # mypy vllm/spec_decoding/*.py --follow-imports=skip --config-file pyproject.toml
+        # mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
+        # mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml
+
diff --git a/format.sh b/format.sh
index deb57b2b049d1..1c195b899c742 100755
--- a/format.sh
+++ b/format.sh
@@ -93,9 +93,23 @@ fi
 echo 'vLLM yapf: Done'
 
 # Run mypy
-# TODO(zhuohan): Enable mypy
-# echo 'vLLM mypy:'
-# mypy
+echo 'vLLM mypy:'
+mypy vllm/attention/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/distributed/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/entrypoints/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/executor/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/usage/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml
+
+# TODO(sang): Follow up
+# mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
+# mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
+# mypy vllm/spec_decoding/*.py --follow-imports=skip --config-file pyproject.toml
+# mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
+# mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml
+
 
 CODESPELL_EXCLUDES=(
     '--skip' '*docs/source/_build/**'
@@ -228,5 +242,3 @@ if ! git diff --quiet &>/dev/null; then
 
     exit 1
 fi
-
-
diff --git a/pyproject.toml b/pyproject.toml
index 607c09935db89..805170719e50a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,10 +45,13 @@ ignore = [
 python_version = "3.8"
 
 ignore_missing_imports = true
+ check_untyped_defs = true
 
 files = "vllm"
 # TODO(woosuk): Include the code from Megatron and HuggingFace.
-exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/"
+exclude = [
+    "vllm/model_executor/parallel_utils/|vllm/model_executor/models/",
+]
 
 
 [tool.codespell]
diff --git a/requirements-common.txt b/requirements-common.txt
index ff053388a23e1..c96f9c9937fb0 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -11,4 +11,5 @@ uvicorn[standard]
 pydantic >= 2.0  # Required for OpenAI server.
 prometheus_client >= 0.18.0
 tiktoken == 0.6.0  # Required for DBRX tokenizer
-outlines == 0.0.34 # Requires torch >= 2.1.0
\ No newline at end of file
+outlines == 0.0.34 # Requires torch >= 2.1.0
+typing_extensions
\ No newline at end of file
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 75d22bbdb2a1b..96dfda6faf00f 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -7,7 +7,7 @@ codespell==2.2.6
 isort==5.13.2
 
 # type checking
-mypy==0.991
+mypy==1.9.0
 types-PyYAML
 types-requests
 types-setuptools
diff --git a/vllm/config.py b/vllm/config.py
index 744fecdc7c64f..ca3f004d0dd4c 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2,7 +2,7 @@
 import json
 import os
 from dataclasses import dataclass, fields
-from typing import TYPE_CHECKING, ClassVar, Optional, Union
+from typing import TYPE_CHECKING, ClassVar, List, Optional, Union
 
 import torch
 from packaging.version import Version
@@ -147,7 +147,7 @@ def _verify_load_format(self) -> None:
         supported_load_format = [
             "auto", "pt", "safetensors", "npcache", "dummy"
         ]
-        rocm_not_supported_load_format = []
+        rocm_not_supported_load_format: List[str] = []
         if load_format not in supported_load_format:
             raise ValueError(
                 f"Unknown load format: {self.load_format}. Must be one of "
@@ -719,6 +719,9 @@ def maybe_create_spec_config(
                 "num_speculative_tokens to be provided, but found "
                 f"{speculative_model=} and {num_speculative_tokens=}.")
 
+        assert (speculative_model is not None
+                and num_speculative_tokens is not None)
+
         # TODO: The user should be able to specify revision/quantization/max
         # model len for the draft model. It is not currently supported.
         draft_revision = None
@@ -1033,7 +1036,7 @@ def _get_and_verify_max_len(
         derived_max_model_len *= scaling_factor
 
     if max_model_len is None:
-        max_model_len = derived_max_model_len
+        max_model_len = int(derived_max_model_len)
     elif max_model_len > derived_max_model_len:
         # Some models might have a separate key for specifying model_max_length
         # that will be bigger than derived_max_model_len. We compare user input
diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index e7e3b4dc1e9b4..e391a3b1e5a33 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -1,5 +1,6 @@
 """A block manager that manages token blocks."""
 from abc import ABC, abstractmethod
+from collections.abc import Sequence as GenericSequence
 from itertools import count, takewhile
 from os.path import commonprefix
 from typing import Dict, List, Optional, Set
@@ -231,10 +232,10 @@ def __init__(
 
         if self.enable_caching:
             logger.info("Automatic prefix caching is enabled.")
-            self.gpu_allocator = CachedBlockAllocator(Device.GPU, block_size,
-                                                      num_gpu_blocks)
-            self.cpu_allocator = CachedBlockAllocator(Device.CPU, block_size,
-                                                      num_cpu_blocks)
+            self.gpu_allocator: BlockAllocatorBase = CachedBlockAllocator(
+                Device.GPU, block_size, num_gpu_blocks)
+            self.cpu_allocator: BlockAllocatorBase = CachedBlockAllocator(
+                Device.CPU, block_size, num_cpu_blocks)
         else:
             self.gpu_allocator = UncachedBlockAllocator(
                 Device.GPU, block_size, num_gpu_blocks)
@@ -588,7 +589,8 @@ def get_all_computed_blocks(self, seq: Sequence) -> List[int]:
             for b in takewhile(lambda b: b.computed, block_table[:-1])
         ]
 
-    def get_common_computed_block_ids(self, seqs: List[Sequence]) -> List[int]:
+    def get_common_computed_block_ids(
+            self, seqs: List[Sequence]) -> GenericSequence[int]:
         """Return the block ids that are common for a given sequence group.
 
         Used in prefill (can skip prefill of some blocks).
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index 813e71ad883b2..19f0cf415eb34 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -1,4 +1,5 @@
 """A block manager that manages token blocks."""
+from collections.abc import Sequence as GenericSequence
 from typing import Dict, List, Optional
 
 from vllm.core.block.block_table import BlockTable
@@ -205,7 +206,8 @@ def mark_blocks_as_computed(self, seq_group: SequenceGroup):
         # as computed.
         self.block_allocator.mark_blocks_as_computed()
 
-    def get_common_computed_block_ids(self, seqs: List[Sequence]) -> List[int]:
+    def get_common_computed_block_ids(
+            self, seqs: List[Sequence]) -> GenericSequence[int]:
         """Determine which blocks for which we skip prefill.
 
         With prefix caching we can skip prefill for previously-generated blocks.
diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py
index 711536bcc97be..c1f68a2e891bf 100644
--- a/vllm/core/interfaces.py
+++ b/vllm/core/interfaces.py
@@ -1,5 +1,6 @@
 import enum
 from abc import ABC, abstractmethod
+from collections.abc import Sequence as GenericSequence
 from typing import Dict, List
 
 from vllm.sequence import Sequence, SequenceGroup
@@ -103,7 +104,8 @@ def access_all_blocks_in_seq(
         pass
 
     @abstractmethod
-    def get_common_computed_block_ids(self, seqs: List[Sequence]) -> List[int]:
+    def get_common_computed_block_ids(
+            self, seqs: List[Sequence]) -> GenericSequence[int]:
         pass
 
     @abstractmethod
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 4aa17adb602d2..4da7600387f05 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -42,8 +42,8 @@ class SchedulingBudget:
     """
     token_budget: int
     max_num_seqs: int
-    _requeset_ids_num_batched_tokens: Set[int] = field(default_factory=set)
-    _requeset_ids_num_curr_seqs: Set[int] = field(default_factory=set)
+    _requeset_ids_num_batched_tokens: Set[str] = field(default_factory=set)
+    _requeset_ids_num_curr_seqs: Set[str] = field(default_factory=set)
     _num_batched_tokens: int = 0
     _num_curr_seqs: int = 0
 
@@ -133,7 +133,7 @@ def is_empty(self) -> bool:
         return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
                 and not self.blocks_to_swap_out and not self.blocks_to_copy)
 
-    def _sort_by_lora_ids(self) -> bool:
+    def _sort_by_lora_ids(self):
         self.scheduled_seq_groups = sorted(
             self.scheduled_seq_groups,
             key=lambda g: (g.seq_group.lora_int_id, g.seq_group.request_id))
@@ -337,7 +337,8 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
                     self.free_seq(seq)
 
     def has_unfinished_seqs(self) -> bool:
-        return self.waiting or self.running or self.swapped
+        return len(self.waiting) != 0 or len(self.running) != 0 or len(
+            self.swapped) != 0
 
     def get_num_unfinished_seq_groups(self) -> int:
         return len(self.waiting) + len(self.running) + len(self.swapped)
@@ -404,7 +405,7 @@ def _schedule_running(
                 budget.subtract_num_seqs(seq_group.request_id,
                                          num_running_seqs)
                 if curr_loras is not None and seq_group.lora_int_id > 0:
-                    curr_loras.pop(seq_group.lora_int_id)
+                    curr_loras.remove(seq_group.lora_int_id)
 
                 if running_queue:
                     # Preempt the lowest-priority sequence groups.
@@ -496,7 +497,7 @@ def _schedule_swapped(
         now = time.time()
         swapped_queue = policy.sort_by_priority(now, swapped_queue)
 
-        leftover_swapped = deque()
+        leftover_swapped: Deque[SequenceGroup] = deque()
         while swapped_queue:
             seq_group = swapped_queue[0]
 
@@ -507,7 +508,9 @@ def _schedule_swapped(
             lora_int_id = 0
             if self.lora_enabled:
                 lora_int_id = seq_group.lora_int_id
-                if (lora_int_id > 0 and lora_int_id not in curr_loras
+                assert curr_loras is not None
+                assert self.lora_config is not None
+                if (lora_int_id > 0 and (lora_int_id not in curr_loras)
                         and len(curr_loras) >= self.lora_config.max_loras):
                     # We don't have a space for another LoRA, so
                     # we ignore this request for now.
@@ -593,7 +596,7 @@ def _schedule_prefills(
         # Copy the queue so that the input queue is not modified.
         waiting_queue = deque([s for s in waiting_queue])
 
-        leftover_waiting_sequences = deque()
+        leftover_waiting_sequences: Deque[SequenceGroup] = deque()
         while self._passed_delay(time.time()) and waiting_queue:
             seq_group = waiting_queue[0]
 
@@ -635,6 +638,8 @@ def _schedule_prefills(
             lora_int_id = 0
             if self.lora_enabled:
                 lora_int_id = seq_group.lora_int_id
+                assert curr_loras is not None
+                assert self.lora_config is not None
                 if (self.lora_enabled and lora_int_id > 0
                         and lora_int_id not in curr_loras
                         and len(curr_loras) >= self.lora_config.max_loras):
@@ -780,7 +785,7 @@ def _schedule_chunked_prefill(self):
             token_budget=self.scheduler_config.max_num_batched_tokens,
             max_num_seqs=self.scheduler_config.max_num_seqs,
         )
-        curr_loras = set()
+        curr_loras: Set[int] = set()
 
         remaining_waiting, prefills = (self.waiting,
                                        SchedulerPrefillOutputs.create_empty())
@@ -1108,7 +1113,7 @@ def _get_num_lookahead_slots(self, is_prefill: bool) -> int:
 
     def _get_num_new_tokens(self, seq_group: SequenceGroup,
                             status: SequenceStatus, enable_chunking: bool,
-                            budget: SchedulingBudget) -> Tuple[int, bool]:
+                            budget: SchedulingBudget) -> int:
         """Get the next new tokens to compute for a given sequence group
             that's in a given `status`.
 
diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py
index 1004d626b6a4b..a3e93691a1e8e 100644
--- a/vllm/distributed/communication_op.py
+++ b/vllm/distributed/communication_op.py
@@ -1,5 +1,5 @@
 from collections import namedtuple
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 from torch.distributed import ProcessGroup
@@ -144,7 +144,7 @@ def broadcast_tensor_dict(
     tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None,
     src: int = 0,
     group: Optional[ProcessGroup] = None,
-) -> Dict[Any, Union[torch.Tensor, Any]]:
+) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]:
     """Broadcast the input tensor dictionary."""
     group = group or torch.distributed.group.WORLD
     ranks = torch.distributed.get_process_group_ranks(group)
@@ -157,10 +157,10 @@ def broadcast_tensor_dict(
 
     rank = torch.distributed.get_rank()
     if rank == src:
+        metadata_list: List[Tuple[Any, Any]] = []
         assert isinstance(
             tensor_dict,
             dict), (f"Expecting a dictionary, got {type(tensor_dict)}")
-        metadata_list = []
         for key, value in tensor_dict.items():
             if isinstance(value, torch.Tensor):
                 assert value.is_cuda, (
@@ -190,10 +190,10 @@ def broadcast_tensor_dict(
         torch.distributed.broadcast_object_list(recv_metadata_list,
                                                 src=src,
                                                 group=group)
-        metadata_list = recv_metadata_list[0]
+        assert recv_metadata_list[0] is not None
         tensor_dict = {}
         async_handles = []
-        for key, value in metadata_list:
+        for key, value in recv_metadata_list[0]:
             if isinstance(value, TensorMetadata):
                 tensor = torch.empty(value.size,
                                      dtype=value.dtype,
diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py
index 70d5c9b1fae05..04d4ed83976d0 100644
--- a/vllm/engine/ray_utils.py
+++ b/vllm/engine/ray_utils.py
@@ -1,9 +1,10 @@
 import pickle
-from typing import List, Optional, Tuple
+from typing import Callable, List, Optional, Tuple
 
 from vllm.config import ParallelConfig
 from vllm.logger import init_logger
 from vllm.utils import get_ip, is_hip, set_cuda_visible_devices
+from vllm.worker.worker import Worker
 
 logger = init_logger(__name__)
 
@@ -18,15 +19,20 @@ def __init__(self, init_cached_hf_modules=False) -> None:
             if init_cached_hf_modules:
                 from transformers.dynamic_module_utils import init_hf_modules
                 init_hf_modules()
-            self.worker = None
+            self._worker: Optional[Worker] = None
             # Since the compiled DAG runs a main execution
             # in a different thread that calls cuda.set_device.
             # The flag indicates is set_device is called on
             # that thread.
             self.compiled_dag_cuda_device_set = False
 
-        def init_worker(self, worker_init_fn):
-            self.worker = worker_init_fn()
+        def init_worker(self, worker_init_fn: Callable[[], Worker]):
+            self._worker = worker_init_fn()
+
+        @property
+        def worker(self) -> Worker:
+            assert self._worker is not None
+            return self._worker
 
         def __getattr__(self, name):
             return getattr(self.worker, name)
@@ -70,8 +76,8 @@ def execute_model_compiled_dag_remote(self, ignored):
     logger.warning(f"Failed to import Ray with {e!r}. "
                    "For distributed inference, please install Ray with "
                    "`pip install ray`.")
-    ray = None
-    RayWorkerVllm = None
+    ray = None  # type: ignore
+    RayWorkerVllm = None  # type: ignore
 
 
 def initialize_ray_cluster(
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
index 2a47eae112c12..587142adb9c6b 100644
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -47,6 +47,7 @@ async def generate(request: Request) -> Response:
     sampling_params = SamplingParams(**request_dict)
     request_id = random_uuid()
 
+    assert engine is not None
     results_generator = engine.generate(prompt, sampling_params, request_id)
 
     # Streaming case
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index c5261d6e4556c..0694d3b7cdb91 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -179,8 +179,12 @@ def generate(
             multi_modal_data.data = multi_modal_data.data.to(torch.float16)
 
         # Add requests to the engine.
-        num_requests = len(prompts) if prompts is not None else len(
-            prompt_token_ids)
+        if prompts is not None:
+            num_requests = len(prompts)
+        else:
+            assert prompt_token_ids is not None
+            num_requests = len(prompt_token_ids)
+
         for i in range(num_requests):
             prompt = prompts[i] if prompts is not None else None
             token_ids = None if prompt_token_ids is None else prompt_token_ids[
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index eda4e8989c163..33e67d8b3eec2 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -1,5 +1,5 @@
 import os
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 
 import torch
 
@@ -61,7 +61,7 @@ def _init_worker(self):
         self.driver_worker.init_device()
         self.driver_worker.load_model()
 
-    def determine_num_available_blocks(self) -> tuple[int, int]:
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of available KV blocks by invoking the
         underlying worker.
         """
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index 80ca5cb7367c5..f20221a0b941a 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 
 from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
@@ -66,7 +66,7 @@ def _init_worker(self):
         self.driver_worker.init_device()
         self.driver_worker.load_model()
 
-    def determine_num_available_blocks(self) -> tuple[int, int]:
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of available KV blocks by invoking the
         underlying worker.
         """
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index 57436a85cfa27..ee8e87432fa67 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 
 from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
@@ -47,7 +47,7 @@ def _init_worker(self):
         self.driver_worker.init_device()
         self.driver_worker.load_model()
 
-    def determine_num_available_blocks(self) -> tuple[int, int]:
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of available KV blocks by invoking the
         underlying worker.
         """
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 6c0ccd7e64c90..b937693c92257 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -3,7 +3,7 @@
 import os
 import pickle
 from collections import defaultdict
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 
 from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
@@ -197,7 +197,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
             max_parallel_loading_workers,
         )
 
-    def determine_num_available_blocks(self) -> tuple[int, int]:
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of available KV blocks.
 
         This invokes `determine_num_available_blocks` on each worker and takes
@@ -205,7 +205,7 @@ def determine_num_available_blocks(self) -> tuple[int, int]:
         compatible with all workers.
 
         Returns:
-            - tuple[num_gpu_blocks, num_cpu_blocks]
+            - Tuple[num_gpu_blocks, num_cpu_blocks]
         """
         # Get the maximum number of blocks that can be allocated on GPU and CPU.
         num_blocks = self._run_workers("determine_num_available_blocks", )
@@ -276,7 +276,7 @@ def _run_workers(
         self,
         method: str,
         *args,
-        driver_args: Optional[List[Any]] = None,
+        driver_args: Optional[Tuple[Any, ...]] = None,
         driver_kwargs: Optional[Dict[str, Any]] = None,
         max_concurrent_workers: Optional[int] = None,
         use_ray_compiled_dag: bool = False,
@@ -291,6 +291,7 @@ def _run_workers(
         if use_ray_compiled_dag:
             # Right now, compiled DAG can only accept a single
             # input. TODO(sang): Fix it.
+            assert self.forward_dag is not None
             output_channels = self.forward_dag.execute(1)
         else:
             # Start the ray workers first.
@@ -369,7 +370,7 @@ async def _run_workers_async(
         self,
         method: str,
         *args,
-        driver_args: Optional[List[Any]] = None,
+        driver_args: Optional[Tuple[Any, ...]] = None,
         driver_kwargs: Optional[Dict[str, Any]] = None,
         **kwargs,
     ) -> Any:
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 0b9787608798c..53a38b25bfdac 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -5,7 +5,8 @@
 from typing import Callable, List, Optional, Union
 
 import torch
-from pydantic import conint
+from pydantic import Field
+from typing_extensions import Annotated
 
 _SAMPLING_EPS = 1e-5
 
@@ -127,7 +128,7 @@ def __init__(
         skip_special_tokens: bool = True,
         spaces_between_special_tokens: bool = True,
         logits_processors: Optional[List[LogitsProcessor]] = None,
-        truncate_prompt_tokens: Optional[conint(ge=1)] = None,
+        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
     ) -> None:
         self.n = n
         self.best_of = best_of if best_of is not None else n
diff --git a/vllm/sequence.py b/vllm/sequence.py
index cdb6cce6f0255..dcde81df19923 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -171,10 +171,10 @@ def get_last_token_id(self) -> int:
             return self.prompt_token_ids[-1]
         return self.output_token_ids[-1]
 
-    def get_prompt_token_ids(self) -> int:
+    def get_prompt_token_ids(self) -> List[int]:
         return self.prompt_token_ids
 
-    def get_output_token_ids(self) -> int:
+    def get_output_token_ids(self) -> List[int]:
         return self.output_token_ids
 
     @property
@@ -370,7 +370,7 @@ class SequenceGroupState:
     """Mutable state tied to a specific sequence group"""
 
     # torch.Generator used in seeded sampling
-    generator: Optional = None
+    generator: Optional = None  # type: ignore
 
 
 class MultiModalData:
@@ -599,7 +599,7 @@ def lora_int_id(self) -> int:
         return self.lora_request.lora_int_id if self.lora_request else 0
 
     @property
-    def token_chunk_size(self) -> int:
+    def token_chunk_size(self) -> Optional[int]:
         """Return the number of tokens to be processed (chunk size)."""
         return self._token_chunk_size
 
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index ce7a30dce72fa..1756c91a612f0 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -2,7 +2,8 @@
 
 from transformers import AutoConfig, PretrainedConfig
 
-from vllm.transformers_utils.configs import *
+from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
+                                             JAISConfig, MPTConfig, RWConfig)
 
 _CONFIG_REGISTRY: Dict[str, PretrainedConfig] = {
     "chatglm": ChatGLMConfig,
diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py
index 005932f1e3df4..f064c26c3f40c 100644
--- a/vllm/transformers_utils/detokenizer.py
+++ b/vllm/transformers_utils/detokenizer.py
@@ -168,8 +168,8 @@ def _convert_tokens_to_string_with_added_encoders(
     # NOTE(woosuk): The following code is slow because it runs a for loop over
     # the output_tokens. In Python, running a for loop over a list can be slow
     # even when the loop body is very simple.
-    sub_texts = []
-    current_sub_text = []
+    sub_texts: List[str] = []
+    current_sub_text: List[str] = []
     all_special_tokens = set(tokenizer.all_special_tokens)
     for token in output_tokens:
         if skip_special_tokens and token in all_special_tokens:
@@ -263,6 +263,7 @@ def detokenize_incrementally(
              tokenizer,
              all_input_ids[:-1],
              skip_special_tokens=skip_special_tokens)
+    assert prev_tokens is not None
 
     # If the new token id is out of bounds, return an empty string.
     if new_token_id >= len(tokenizer):
@@ -271,6 +272,8 @@ def detokenize_incrementally(
         # Put new_token_id in a list so skip_special_tokens is respected
         new_tokens = tokenizer.convert_ids_to_tokens(
             [new_token_id], skip_special_tokens=skip_special_tokens)
+        if isinstance(new_tokens, str):
+            new_tokens = [new_tokens]
     output_tokens = prev_tokens + new_tokens
 
     # If this is the first iteration, return all tokens.
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index e216a99af91f9..5d3d5801c960d 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -5,7 +5,7 @@
 
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.transformers_utils.tokenizers import *
+from vllm.transformers_utils.tokenizers import BaichuanTokenizer
 from vllm.utils import make_async
 
 logger = init_logger(__name__)
@@ -28,7 +28,7 @@ def get_cached_tokenizer(
     tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
     tokenizer_len = len(tokenizer)
 
-    class CachedTokenizer(tokenizer.__class__):
+    class CachedTokenizer(tokenizer.__class__):  # type: ignore
 
         @property
         def all_special_ids(self):
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index 658fe5c98f5ee..b2672f7f1da61 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -7,7 +7,7 @@
 from enum import Enum
 from pathlib import Path
 from threading import Thread
-from typing import Dict, Optional
+from typing import Any, Dict, Optional
 from uuid import uuid4
 
 import cpuinfo
@@ -124,7 +124,7 @@ def __init__(self) -> None:
     def report_usage(self,
                      model_architecture: str,
                      usage_context: UsageContext,
-                     extra_kvs: Dict[str, any] = None) -> None:
+                     extra_kvs: Optional[Dict[str, Any]] = None) -> None:
         t = Thread(target=self._report_usage_worker,
                    args=(model_architecture, usage_context, extra_kvs or {}),
                    daemon=True)
@@ -132,13 +132,13 @@ def report_usage(self,
 
     def _report_usage_worker(self, model_architecture: str,
                              usage_context: UsageContext,
-                             extra_kvs: Dict[str, any]) -> None:
+                             extra_kvs: Dict[str, Any]) -> None:
         self._report_usage_once(model_architecture, usage_context, extra_kvs)
         self._report_continous_usage()
 
     def _report_usage_once(self, model_architecture: str,
                            usage_context: UsageContext,
-                           extra_kvs: Dict[str, any]) -> None:
+                           extra_kvs: Dict[str, Any]) -> None:
         # Platform information
         if torch.cuda.is_available():
             device_property = torch.cuda.get_device_properties(0)
diff --git a/vllm/utils.py b/vllm/utils.py
index e67d267aed408..f4aaf0cc51d7a 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -60,7 +60,7 @@ def __contains__(self, key: Hashable) -> bool:
     def __len__(self) -> int:
         return len(self.cache)
 
-    def __getitem__(self, key: Hashable) -> T:
+    def __getitem__(self, key: Hashable) -> Optional[T]:
         return self.get(key)
 
     def __setitem__(self, key: Hashable, value: T) -> None:
@@ -76,7 +76,7 @@ def get(self,
             key: Hashable,
             default_value: Optional[T] = None) -> Optional[T]:
         if key in self.cache:
-            value = self.cache[key]
+            value: Optional[T] = self.cache[key]
             self.cache.move_to_end(key)
         else:
             value = default_value
@@ -87,7 +87,7 @@ def put(self, key: Hashable, value: T) -> None:
         self.cache.move_to_end(key)
         self._remove_old_if_needed()
 
-    def _on_remove(self, key: Hashable, value: T):
+    def _on_remove(self, key: Hashable, value: Optional[T]):
         pass
 
     def remove_oldest(self):
@@ -100,9 +100,11 @@ def _remove_old_if_needed(self) -> None:
         while len(self.cache) > self.capacity:
             self.remove_oldest()
 
-    def pop(self, key: Hashable, default_value: Optional[Any] = None) -> T:
+    def pop(self,
+            key: Hashable,
+            default_value: Optional[T] = None) -> Optional[T]:
         run_on_remove = key in self.cache
-        value = self.cache.pop(key, default_value)
+        value: Optional[T] = self.cache.pop(key, default_value)
         if run_on_remove:
             self._on_remove(key, value)
         return value

From 58911ecaee7ca50ad2a85820b6736a6b3aa9e9f6 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 12 Apr 2024 16:40:39 -0700
Subject: [PATCH 005/106] [Core] fix custom allreduce default value (#4040)

---
 vllm/entrypoints/llm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 0694d3b7cdb91..5a8cd046df09c 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -93,7 +93,7 @@ def __init__(
         swap_space: int = 4,
         enforce_eager: bool = False,
         max_context_len_to_capture: int = 8192,
-        disable_custom_all_reduce: bool = True,
+        disable_custom_all_reduce: bool = False,
         **kwargs,
     ) -> None:
         if "disable_log_stats" not in kwargs:

From 094013daad7496c495c70465d71eb676aa37850c Mon Sep 17 00:00:00 2001
From: Bellk17 <Kyletbell@ymail.com>
Date: Fri, 12 Apr 2024 16:41:26 -0700
Subject: [PATCH 006/106] Fix triton compilation issue (#3984)

Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/attention/ops/triton_flash_attention.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index 87cf30cbef79a..e160411859f0b 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -415,7 +415,11 @@ def attn_fwd(
             return
 
     is_mqa = hq != hk
-    off_h_k = off_h_q % hk if is_mqa else off_h_q
+    if is_mqa:  # noqa: SIM108
+        off_h_k = off_h_q % hk
+    else:
+        off_h_k = off_h_q
+
     n_extra_tokens = 0
     if seqlen_k < BLOCK_N:
         n_extra_tokens = BLOCK_N - seqlen_k

From 0b5c9ea96f34d4a3aefb86e87a6540001d3b1914 Mon Sep 17 00:00:00 2001
From: Jee Li <pandaleefree@163.com>
Date: Sat, 13 Apr 2024 07:56:37 +0800
Subject: [PATCH 007/106] [Bugfix] Fix LoRA bug (#4032)

---
 vllm/lora/layers.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 4b9653de73a88..aac86351b15e1 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -32,14 +32,17 @@
 def _get_lora_device(base_layer: nn.Module) -> torch.device:
     # code borrowed from https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/vllm/lora/layers.py#L34
     """Returns the device for where to place the LoRA tensors."""
+    # unquantizedLinear
     if hasattr(base_layer, "weight"):
         return base_layer.weight.device
-    if hasattr(base_layer, "linear_weights") and isinstance(
-            base_layer.linear_weights, dict):
-        values = list(base_layer.linear_weights.values())
-        if len(values) and isinstance(values[0], torch.Tensor):
-            return values[0].device
-    raise ValueError(f"Unsupported base layer: {base_layer}")
+    # GPTQ/AWQ/SqueezeLLM
+    elif hasattr(base_layer, "qweight"):
+        return base_layer.qweight.device
+    # marlin
+    elif hasattr(base_layer, "B"):
+        return base_layer.B.device
+    else:
+        raise ValueError(f"Unsupported base layer: {base_layer}")
 
 
 def _apply_lora(

From b35bba783c7b7c87df9d54a59b0bd69b005ceeb1 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 12 Apr 2024 18:43:37 -0700
Subject: [PATCH 008/106] [CI/Test] expand ruff and yapf for all supported
 python version (#4037)

---
 .github/workflows/mypy.yaml | 2 +-
 .github/workflows/ruff.yml  | 2 +-
 .github/workflows/yapf.yml  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index fbe0f816fd4af..6db0bb7645ecd 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index e8060e369a889..e71033f828006 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml
index b163c960db555..04f307bcf8b0e 100644
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}

From 03566840233019330619a49851dc1bac17a22914 Mon Sep 17 00:00:00 2001
From: Dylan Hawk <51147702+dylanwhawk@users.noreply.github.com>
Date: Fri, 12 Apr 2024 21:07:04 -0700
Subject: [PATCH 009/106] [Bugfix] More type hint fixes for py 3.8 (#4039)

---
 vllm/executor/executor_base.py | 2 +-
 vllm/worker/cpu_worker.py      | 4 ++--
 vllm/worker/neuron_worker.py   | 4 ++--
 vllm/worker/worker_base.py     | 6 +++---
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 55bccfa8e3ca9..bbfbfc689c99f 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -39,7 +39,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         ExecutorBase may require modification of the result, e.g. to ensure the
         selected cache sizes are compatible with all workers.
 
-        Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
+        Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
         are blocks that are "active" on the device and can be appended to.
         num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
         appended to.
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 3989207e8dd83..41341b063bed7 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -1,5 +1,5 @@
 """A CPU worker class."""
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 
 import torch
 import torch.distributed
@@ -157,7 +157,7 @@ def init_device(self) -> None:
     def load_model(self):
         self.model_runner.load_model()
 
-    def determine_num_available_blocks(self) -> tuple[int, int]:
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of blocks available for the KV cache.
 
         This determines how many KV blocks can fit into the configured CPU
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index 6136d50d0c068..2f22f82c045db 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -1,5 +1,5 @@
 """A Neuron worker class."""
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 import torch
 import torch.distributed
@@ -40,7 +40,7 @@ def init_device(self) -> None:
     def load_model(self):
         self.model_runner.load_model()
 
-    def determine_num_available_blocks(self) -> tuple[int, int]:
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of available KV blocks.
 
         Swapping is not yet supported, so always return num_cpu_blocks=0.
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index e3027c406ffeb..d8c9febb11584 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Dict, List
+from typing import Dict, List, Tuple
 
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
@@ -18,14 +18,14 @@ def init_device(self) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def determine_num_available_blocks(self) -> tuple[int, int]:
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of available blocks for the GPU KV cache and
         swappable CPU KV cache.
 
         The implementation may run profiling or other heuristics to determine
         the size of caches.
 
-        Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
+        Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
         are blocks that are "active" on the device and can be appended to.
         num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
         appended to.

From 0f5a490e411a0db7c73bd798484749232617f94d Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 13 Apr 2024 07:12:53 -0700
Subject: [PATCH 010/106] [Core][Distributed] improve logging for init dist
 (#4042)

---
 vllm/distributed/parallel_state.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 9fceffe7cb88b..1258bf58cb453 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -8,6 +8,10 @@
 
 import torch
 
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
 # Tensor model parallel group that the current rank belongs to.
 _TENSOR_MODEL_PARALLEL_GROUP = None
 # Pipeline model parallel group that the current rank belongs to.
@@ -45,6 +49,8 @@ def init_distributed_environment(
     local_rank: int = -1,
     backend: str = "nccl",
 ):
+    logger.debug(f"{world_size=} {rank=} {local_rank=} "
+                 f"{distributed_init_method=} {backend=}")
     if not torch.distributed.is_initialized():
         assert distributed_init_method is not None, (
             "distributed_init_method must be provided when initializing "

From a738567cbda1923642e24aabec8302245fc9ba7b Mon Sep 17 00:00:00 2001
From: zspo <songpo.zhang@foxmail.com>
Date: Sat, 13 Apr 2024 22:52:36 +0800
Subject: [PATCH 011/106] [Bugfix] fix_log_time_in_metrics (#4050)

---
 vllm/engine/metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 02560907a1282..04e27e69ce0f3 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -130,7 +130,7 @@ class StatLogger:
 
     def __init__(self, local_interval: float, labels: Dict[str, str]) -> None:
         # Metadata for logging locally.
-        self.last_local_log = time.monotonic()
+        self.last_local_log = time.time()
         self.local_interval = local_interval
 
         # Tracked stats over current local logging interval.

From 54448608680b13883b125060c536de8c733dbcff Mon Sep 17 00:00:00 2001
From: zspo <songpo.zhang@foxmail.com>
Date: Sat, 13 Apr 2024 22:54:03 +0800
Subject: [PATCH 012/106] [Bugfix] fix_small_bug_in_neuron_executor (#4051)

---
 vllm/executor/neuron_executor.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index ee8e87432fa67..d45f18e466256 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -25,6 +25,7 @@ def __init__(
         speculative_config: Optional[SpeculativeConfig],
     ) -> None:
         self.model_config = model_config
+        self.cache_config = cache_config
         assert lora_config is None, "LoRA is not supported for Neuron backend."
         self.parallel_config = parallel_config
         self.scheduler_config = scheduler_config
@@ -43,6 +44,7 @@ def _init_worker(self):
             self.parallel_config,
             self.scheduler_config,
             self.device_config,
+            self.cache_config,
         )
         self.driver_worker.init_device()
         self.driver_worker.load_model()

From 7dd0af02c0020dd3ddcea64bf51d0b1d66f3faf4 Mon Sep 17 00:00:00 2001
From: Jee Li <pandaleefree@163.com>
Date: Sat, 13 Apr 2024 22:55:05 +0800
Subject: [PATCH 013/106] [Kernel] Add punica dimension for Baichuan-13B
 (#4053)

---
 csrc/punica/bgmv/bgmv_config.h | 1 +
 tests/lora/test_baichuan.py    | 2 +-
 tests/lora/test_punica.py      | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h
index 9b76b98ab3322..d2906914f927e 100644
--- a/csrc/punica/bgmv/bgmv_config.h
+++ b/csrc/punica/bgmv/bgmv_config.h
@@ -47,6 +47,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 13696) \
     f(in_T, out_T, W_T, narrow, 13824) \
     f(in_T, out_T, W_T, narrow, 14336) \
+    f(in_T, out_T, W_T, narrow, 15360) \
     f(in_T, out_T, W_T, narrow, 16384) \
     f(in_T, out_T, W_T, narrow, 20480) \
     f(in_T, out_T, W_T, narrow, 22016) \
diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py
index 2178266d2e0c8..5ab863eea94b3 100644
--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -62,7 +62,7 @@ def test_baichuan_lora(baichuan_lora_files):
 
 
 @pytest.mark.skip("Requires multiple GPUs")
-def test_llama_tensor_parallel_equality(baichuan_lora_files):
+def test_baichuan_tensor_parallel_equality(baichuan_lora_files):
     # Cannot use as it will initialize torch.cuda too early...
     # if torch.cuda.device_count() < 4:
     #     pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py
index cab8b44ccd2df..8b174f01d87d4 100644
--- a/tests/lora/test_punica.py
+++ b/tests/lora/test_punica.py
@@ -72,6 +72,7 @@ def _lora_ref_impl(
     11008,
     13824,
     14336,
+    15360,
     22016,
     24576,
     27392,

From fab8ca1ecf2a271e73d16839a3d65e00cb676408 Mon Sep 17 00:00:00 2001
From: Sanger Steel <sangersteel@gmail.com>
Date: Sat, 13 Apr 2024 20:13:01 -0400
Subject: [PATCH 014/106] [Frontend] [Core] feat: Add model loading using
 `tensorizer` (#3476)

---
 .buildkite/test-pipeline.yaml                 |   3 +
 docs/source/conf.py                           |   1 +
 docs/source/models/engine_args.rst            |   3 +-
 examples/tensorize_vllm_model.py              | 254 ++++++++++++++
 requirements-cpu.txt                          |   2 +-
 requirements-dev.txt                          |   1 +
 setup.py                                      |   3 +
 tests/tensorizer/__init__.py                  |   0
 .../tensorize_vllm_model_for_testing.py       | 245 ++++++++++++++
 tests/tensorizer/test_tensorizer.py           | 302 +++++++++++++++++
 vllm/config.py                                |  74 +++-
 vllm/engine/arg_utils.py                      |  45 ++-
 vllm/engine/llm_engine.py                     |   8 +-
 vllm/executor/gpu_executor.py                 |  23 +-
 vllm/executor/ray_gpu_executor.py             |   6 +-
 vllm/model_executor/model_loader.py           |  61 +++-
 vllm/model_executor/tensorizer_loader.py      | 319 ++++++++++++++++++
 vllm/model_executor/weight_utils.py           |  34 +-
 vllm/worker/model_runner.py                   |   9 +-
 vllm/worker/worker.py                         |   9 +-
 20 files changed, 1351 insertions(+), 51 deletions(-)
 create mode 100644 examples/tensorize_vllm_model.py
 create mode 100644 tests/tensorizer/__init__.py
 create mode 100644 tests/tensorizer/tensorize_vllm_model_for_testing.py
 create mode 100644 tests/tensorizer/test_tensorizer.py
 create mode 100644 vllm/model_executor/tensorizer_loader.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 8d7d6304cf12e..aa4582bbda0c7 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -91,6 +91,9 @@ steps:
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 4
 
+- label: Tensorizer Test
+  command: apt-get install curl libsodium23 && pytest -v -s tensorizer
+
 - label: Metrics Test
   command: pytest -v -s metrics
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 7a8c365ffb3bb..19cc8557a7541 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -83,6 +83,7 @@
     "vllm._C",
     "numpy",
     "tqdm",
+    "tensorizer",
 ]
 
 for mock_target in autodoc_mock_imports:
diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst
index d8a7ac72e0175..886a806934c04 100644
--- a/docs/source/models/engine_args.rst
+++ b/docs/source/models/engine_args.rst
@@ -36,7 +36,7 @@ Below, you can find an explanation of every engine argument for vLLM:
 
     Directory to download and load the weights, default to the default cache dir of huggingface.
 
-.. option:: --load-format {auto,pt,safetensors,npcache,dummy}
+.. option:: --load-format {auto,pt,safetensors,npcache,dummy,tensorizer}
 
     The format of the model weights to load.
 
@@ -45,6 +45,7 @@ Below, you can find an explanation of every engine argument for vLLM:
     * "safetensors" will load the weights in the safetensors format.
     * "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading.
     * "dummy" will initialize the weights with random values, mainly for profiling.
+    * "tensorizer" will load serialized weights using `CoreWeave's Tensorizer model deserializer. <https://github.com/coreweave/tensorizer>`_. See `tensorized_vllm_model.py` in the examples folder to serialize a vLLM model, and for more information. Tensorizer support for vLLM can be installed with `pip install vllm[tensorizer]`.
 
 .. option:: --dtype {auto,half,float16,bfloat16,float,float32}
 
diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py
new file mode 100644
index 0000000000000..3c20a38c7f726
--- /dev/null
+++ b/examples/tensorize_vllm_model.py
@@ -0,0 +1,254 @@
+import argparse
+import dataclasses
+import os
+import time
+import uuid
+from functools import partial
+from typing import Type
+
+import torch
+import torch.nn as nn
+from tensorizer import (DecryptionParams, EncryptionParams, TensorDeserializer,
+                        TensorSerializer, stream_io)
+from tensorizer.utils import convert_bytes, get_mem_usage, no_init_or_tensor
+from transformers import AutoConfig, PretrainedConfig
+
+from vllm.distributed import initialize_model_parallel
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.model_executor.models import ModelRegistry
+from vllm.model_executor.tensorizer_loader import TensorizerArgs
+
+# yapf conflicts with isort for this docstring
+# yapf: disable
+"""
+tensorize_vllm_model.py is a script that can be used to serialize and 
+deserialize vLLM models. These models can be loaded using tensorizer directly 
+to the GPU extremely quickly. Tensor encryption and decryption is also 
+supported, although libsodium must be installed to use it. Install
+vllm with tensorizer support using `pip install vllm[tensorizer]`.
+
+To serialize a model, you can run something like this:
+
+python tensorize_vllm_model.py \
+   --model EleutherAI/gpt-j-6B \
+   --dtype float16 \
+   serialize \
+   --serialized-directory s3://my-bucket/ \
+   --suffix vllm
+   
+Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
+and saves it to your S3 bucket. A local directory can also be used.
+
+You can also encrypt the model weights with a randomly-generated key by 
+providing a `--keyfile` argument.
+
+To deserialize a model, you can run something like this:
+
+python tensorize_vllm_model.py \
+   --model EleutherAI/gpt-j-6B \
+   --dtype float16 \
+   deserialize \
+   --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors
+
+Which downloads the model tensors from your S3 bucket and deserializes them.
+To provide S3 credentials, you can provide `--s3-access-key-id` and 
+`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this script,
+the OpenAI entrypoint, as arguments for LLM(), or as environment variables
+in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`.
+
+
+You can also provide a `--keyfile` argument to decrypt the model weights if 
+they were serialized with encryption.
+
+For more information on the available arguments, run 
+`python tensorize_vllm_model.py --help`.
+"""
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="An example script that can be used to serialize and "
+        "deserialize vLLM models. These models "
+        "can be loaded using tensorizer directly to the GPU "
+        "extremely quickly. Tensor encryption and decryption is "
+        "also supported, although libsodium must be installed to "
+        "use it.")
+    parser = EngineArgs.add_cli_args(parser)
+    subparsers = parser.add_subparsers(dest='command')
+
+    serialize_parser = subparsers.add_parser(
+        'serialize', help="Serialize a model to `--serialized-directory`")
+
+    serialize_parser.add_argument(
+        "--suffix",
+        type=str,
+        required=False,
+        help=(
+            "The suffix to append to the serialized model directory, which is "
+            "used to construct the location of the serialized model tensors, "
+            "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
+            "`--suffix` is `v1`, the serialized model tensors will be "
+            "saved to "
+            "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
+            "If none is provided, a random UUID will be used."))
+    serialize_parser.add_argument(
+        "--serialized-directory",
+        type=str,
+        required=True,
+        help="The directory to serialize the model to. "
+        "This can be a local directory or S3 URI. The path to where the "
+        "tensors are saved is a combination of the supplied `dir` and model "
+        "reference ID. For instance, if `dir` is the serialized directory, "
+        "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
+        "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
+        "where `suffix` is given by `--suffix` or a random UUID if not "
+        "provided.")
+
+    serialize_parser.add_argument(
+        "--keyfile",
+        type=str,
+        required=False,
+        help=("Encrypt the model weights with a randomly-generated binary key,"
+              " and save the key at this path"))
+
+    deserialize_parser = subparsers.add_parser(
+        'deserialize',
+        help=("Deserialize a model from `--path-to-tensors`"
+              " to verify it can be loaded and used."))
+
+    deserialize_parser.add_argument(
+        "--path-to-tensors",
+        type=str,
+        required=True,
+        help="The local path or S3 URI to the model tensors to deserialize. ")
+
+    deserialize_parser.add_argument(
+        "--keyfile",
+        type=str,
+        required=False,
+        help=("Path to a binary key to use to decrypt the model weights,"
+              " if the model was serialized with encryption"))
+
+    return parser.parse_args()
+
+
+def make_model_contiguous(model):
+    # Ensure tensors are saved in memory contiguously
+    for param in model.parameters():
+        param.data = param.data.contiguous()
+
+
+def _get_vllm_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
+    architectures = getattr(config, "architectures", [])
+    for arch in architectures:
+        model_cls = ModelRegistry.load_model_cls(arch)
+        if model_cls is not None:
+            return model_cls
+    raise ValueError(
+        f"Model architectures {architectures} are not supported for now. "
+        f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+def serialize():
+
+    eng_args_dict = {f.name: getattr(args, f.name) for f in
+                     dataclasses.fields(EngineArgs)}
+    engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
+    engine = LLMEngine.from_engine_args(engine_args)
+
+    model = (engine.model_executor.driver_worker.
+             model_runner.model)
+
+    encryption_params = EncryptionParams.random() if keyfile else None
+    if keyfile:
+        with _write_stream(keyfile) as stream:
+            stream.write(encryption_params.key)
+
+    with _write_stream(model_path) as stream:
+        serializer = TensorSerializer(stream, encryption=encryption_params)
+        serializer.write_module(model)
+        serializer.close()
+
+    print("Serialization complete. Model tensors saved to", model_path)
+    if keyfile:
+        print("Key saved to", keyfile)
+
+
+def deserialize():
+    config = AutoConfig.from_pretrained(model_ref)
+
+    with no_init_or_tensor():
+        model_class = _get_vllm_model_architecture(config)
+        model = model_class(config)
+
+    before_mem = get_mem_usage()
+    start = time.time()
+
+    if keyfile:
+        with _read_stream(keyfile) as stream:
+            key = stream.read()
+            decryption_params = DecryptionParams.from_key(key)
+            tensorizer_args.deserializer_params['encryption'] = \
+                decryption_params
+
+    with (_read_stream(model_path)) as stream, TensorDeserializer(
+            stream, **tensorizer_args.deserializer_params) as deserializer:
+        deserializer.load_into_module(model)
+        end = time.time()
+
+    # Brag about how fast we are.
+    total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
+    duration = end - start
+    per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
+    after_mem = get_mem_usage()
+    print(
+        f"Deserialized {total_bytes_str} in {end - start:0.2f}s, {per_second}/s"
+    )
+    print(f"Memory usage before: {before_mem}")
+    print(f"Memory usage after: {after_mem}")
+
+    return model
+
+
+args = parse_args()
+
+s3_access_key_id = (args.s3_access_key_id or os.environ.get("S3_ACCESS_KEY_ID")
+                    or None)
+s3_secret_access_key = (args.s3_secret_access_key
+                        or os.environ.get("S3_SECRET_ACCESS_KEY") or None)
+
+s3_endpoint = (args.s3_endpoint or os.environ.get("S3_ENDPOINT_URL") or None)
+
+_read_stream, _write_stream = (partial(
+    stream_io.open_stream,
+    mode=mode,
+    s3_access_key_id=s3_access_key_id,
+    s3_secret_access_key=s3_secret_access_key,
+    s3_endpoint=s3_endpoint,
+) for mode in ("rb", "wb+"))
+
+model_ref = args.model
+
+model_name = model_ref.split("/")[1]
+
+os.environ["MASTER_ADDR"] = "127.0.0.1"
+os.environ["MASTER_PORT"] = "8080"
+
+torch.distributed.init_process_group(world_size=1, rank=0)
+initialize_model_parallel()
+
+keyfile = args.keyfile if args.keyfile else None
+
+if args.command == "serialize":
+    input_dir = args.serialized_directory.rstrip('/')
+    suffix = args.suffix if args.suffix else uuid.uuid4().hex
+    base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
+    model_path = f"{base_path}/model.tensors"
+    serialize()
+elif args.command == "deserialize":
+    tensorizer_args = TensorizerArgs.from_cli_args(args)
+    model_path = args.path_to_tensors
+    deserialize()
+else:
+    raise ValueError("Either serialize or deserialize must be specified.")
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 36d20bc9473ea..5779b38b24e69 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -3,4 +3,4 @@
 
 # Dependencies for x86_64 CPUs
 torch == 2.2.1+cpu
-triton >= 2.1.0  # FIXME(woosuk): This is a hack to avoid import error.
+triton >= 2.1.0  # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 96dfda6faf00f..1317e51b2dd11 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -14,6 +14,7 @@ types-setuptools
 
 # testing
 pytest
+tensorizer==2.9.0a0
 pytest-forked
 pytest-asyncio
 pytest-rerunfailures
diff --git a/setup.py b/setup.py
index 92e2b418c9419..5d9a646b11b0b 100644
--- a/setup.py
+++ b/setup.py
@@ -428,6 +428,9 @@ def get_extra_requirements() -> dict:
     install_requires=get_requirements(),
     extras_require=get_extra_requirements(),
     ext_modules=ext_modules,
+    extras_require={
+        "optional": ["tensorizer==2.9.0a1"],
+    },
     cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
     package_data=package_data,
 )
diff --git a/tests/tensorizer/__init__.py b/tests/tensorizer/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/tensorizer/tensorize_vllm_model_for_testing.py b/tests/tensorizer/tensorize_vllm_model_for_testing.py
new file mode 100644
index 0000000000000..d0be08329fd64
--- /dev/null
+++ b/tests/tensorizer/tensorize_vllm_model_for_testing.py
@@ -0,0 +1,245 @@
+import argparse
+import dataclasses
+import os
+import time
+import uuid
+from functools import partial
+from typing import Type
+
+import torch
+import torch.nn as nn
+from tensorizer import (DecryptionParams, EncryptionParams, TensorDeserializer,
+                        TensorSerializer, stream_io)
+from tensorizer.utils import convert_bytes, get_mem_usage, no_init_or_tensor
+from transformers import AutoConfig, PretrainedConfig
+
+from vllm.distributed import initialize_model_parallel
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.model_executor.models import ModelRegistry
+from vllm.model_executor.tensorizer_loader import TensorizerArgs
+
+# yapf conflicts with isort for this docstring
+# yapf: disable
+"""
+tensorize_vllm_model.py is a script that can be used to serialize and 
+deserialize vLLM models. These models can be loaded using tensorizer directly 
+to the GPU extremely quickly. Tensor encryption and decryption is also 
+supported, although libsodium must be installed to use it. Install
+vllm with tensorizer support using `pip install vllm[tensorizer]`.
+
+To serialize a model, you can run something like this:
+
+python tensorize_vllm_model.py \
+   --model EleutherAI/gpt-j-6B \
+   --dtype float16 \
+   serialize \
+   --serialized-directory s3://my-bucket/ \
+   --suffix vllm
+
+Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
+and saves it to your S3 bucket. A local directory can also be used.
+
+You can also encrypt the model weights with a randomly-generated key by 
+providing a `--keyfile` argument.
+
+To deserialize a model, you can run something like this:
+
+python tensorize_vllm_model.py \
+   --model EleutherAI/gpt-j-6B \
+   --dtype float16 \
+   deserialize \
+   --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors
+
+Which downloads the model tensors from your S3 bucket and deserializes them.
+To provide S3 credentials, you can provide `--s3-access-key-id` and 
+`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this script,
+the OpenAI entrypoint, as arguments for LLM(), or as environment variables
+in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`.
+
+
+You can also provide a `--keyfile` argument to decrypt the model weights if 
+they were serialized with encryption.
+
+For more information on the available arguments, run 
+`python tensorize_vllm_model.py --help`.
+"""
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="An example script that can be used to serialize and "
+                    "deserialize vLLM models. These models "
+                    "can be loaded using tensorizer directly to the GPU "
+                    "extremely quickly. Tensor encryption and decryption is "
+                    "also supported, although libsodium must be installed to "
+                    "use it.")
+    parser = EngineArgs.add_cli_args(parser)
+    subparsers = parser.add_subparsers(dest='command')
+
+    serialize_parser = subparsers.add_parser(
+        'serialize', help="Serialize a model to `--serialized-directory`")
+
+    serialize_parser.add_argument(
+        "--suffix",
+        type=str,
+        required=False,
+        help=(
+            "The suffix to append to the serialized model directory, which is "
+            "used to construct the location of the serialized model tensors, "
+            "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
+            "`--suffix` is `v1`, the serialized model tensors will be "
+            "saved to "
+            "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
+            "If none is provided, a random UUID will be used."))
+    serialize_parser.add_argument(
+        "--serialized-directory",
+        type=str,
+        required=True)
+
+    serialize_parser.add_argument(
+        "--keyfile",
+        type=str,
+        required=False,
+        help=("Encrypt the model weights with a randomly-generated binary key,"
+              " and save the key at this path"))
+
+    deserialize_parser = subparsers.add_parser(
+        'deserialize',
+        help=("Deserialize a model from `--path-to-tensors`"
+              " to verify it can be loaded and used."))
+
+    deserialize_parser.add_argument(
+        "--path-to-tensors",
+        type=str,
+        required=True,
+        help="The local path or S3 URI to the model tensors to deserialize. ")
+
+    deserialize_parser.add_argument(
+        "--keyfile",
+        type=str,
+        required=False,
+        help=("Path to a binary key to use to decrypt the model weights,"
+              " if the model was serialized with encryption"))
+
+    return parser.parse_args()
+
+
+def make_model_contiguous(model):
+    # Ensure tensors are saved in memory contiguously
+    for param in model.parameters():
+        param.data = param.data.contiguous()
+
+
+def _get_vllm_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
+    architectures = getattr(config, "architectures", [])
+    for arch in architectures:
+        model_cls = ModelRegistry.load_model_cls(arch)
+        if model_cls is not None:
+            return model_cls
+    raise ValueError(
+        f"Model architectures {architectures} are not supported for now. "
+        f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+def serialize():
+    eng_args_dict = {f.name: getattr(args, f.name) for f in
+                     dataclasses.fields(EngineArgs)}
+    engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
+    engine = LLMEngine.from_engine_args(engine_args)
+
+    model = (engine.model_executor.driver_worker.
+             model_runner.model)
+
+    encryption_params = EncryptionParams.random() if keyfile else None
+    if keyfile:
+        with _write_stream(keyfile) as stream:
+            stream.write(encryption_params.key)
+
+    with _write_stream(model_path) as stream:
+        serializer = TensorSerializer(stream, encryption=encryption_params)
+        serializer.write_module(model)
+        serializer.close()
+
+    print("Serialization complete. Model tensors saved to", model_path)
+    if keyfile:
+        print("Key saved to", keyfile)
+
+
+def deserialize():
+    config = AutoConfig.from_pretrained(model_ref)
+
+    with no_init_or_tensor():
+        model_class = _get_vllm_model_architecture(config)
+        model = model_class(config)
+
+    before_mem = get_mem_usage()
+    start = time.time()
+
+    if keyfile:
+        with _read_stream(keyfile) as stream:
+            key = stream.read()
+            decryption_params = DecryptionParams.from_key(key)
+            tensorizer_args.deserializer_params['encryption'] = \
+                decryption_params
+
+    with (_read_stream(model_path)) as stream, TensorDeserializer(
+            stream, **tensorizer_args.deserializer_params) as deserializer:
+        deserializer.load_into_module(model)
+        end = time.time()
+
+    # Brag about how fast we are.
+    total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
+    duration = end - start
+    per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
+    after_mem = get_mem_usage()
+    print(
+        f"Deserialized {total_bytes_str} in {end - start:0.2f}s, {per_second}/s"
+    )
+    print(f"Memory usage before: {before_mem}")
+    print(f"Memory usage after: {after_mem}")
+
+    return model
+
+
+args = parse_args()
+
+s3_access_key_id = (args.s3_access_key_id or os.environ.get("S3_ACCESS_KEY_ID")
+                    or None)
+s3_secret_access_key = (args.s3_secret_access_key
+                        or os.environ.get("S3_SECRET_ACCESS_KEY") or None)
+
+s3_endpoint = (args.s3_endpoint or os.environ.get("S3_ENDPOINT_URL") or None)
+
+_read_stream, _write_stream = (partial(
+    stream_io.open_stream,
+    mode=mode,
+    s3_access_key_id=s3_access_key_id,
+    s3_secret_access_key=s3_secret_access_key,
+    s3_endpoint=s3_endpoint,
+) for mode in ("rb", "wb+"))
+
+model_ref = args.model
+
+model_name = model_ref.split("/")[1]
+
+os.environ["MASTER_ADDR"] = "127.0.0.1"
+os.environ["MASTER_PORT"] = "8080"
+
+torch.distributed.init_process_group(world_size=1, rank=0)
+initialize_model_parallel()
+
+keyfile = args.keyfile if args.keyfile else None
+
+if args.command == "serialize":
+    input_dir = args.serialized_directory.rstrip('/')
+    suffix = args.suffix if args.suffix else uuid.uuid4().hex
+    base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
+    model_path = f"{base_path}/model.tensors"
+    serialize()
+elif args.command == "deserialize":
+    tensorizer_args = TensorizerArgs.from_cli_args(args)
+    model_path = args.path_to_tensors
+    deserialize()
+else:
+    raise ValueError("Either serialize or deserialize must be specified.")
diff --git a/tests/tensorizer/test_tensorizer.py b/tests/tensorizer/test_tensorizer.py
new file mode 100644
index 0000000000000..2ab893e95da9c
--- /dev/null
+++ b/tests/tensorizer/test_tensorizer.py
@@ -0,0 +1,302 @@
+import gc
+import subprocess
+from unittest.mock import MagicMock, patch
+
+import pytest
+import torch
+
+from tests.entrypoints.test_openai_server import ServerRunner
+from vllm import SamplingParams
+from vllm.config import TensorizerConfig
+from vllm.model_executor.tensorizer_loader import (
+    EncryptionParams, TensorSerializer, is_vllm_serialized_tensorizer,
+    load_with_tensorizer, open_stream)
+
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
+
+model_ref = "facebook/opt-125m"
+
+
+def is_curl_installed():
+    try:
+        subprocess.check_call(['curl', '--version'])
+        return True
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return False
+
+
+@pytest.fixture(autouse=True)
+def tensorizer_config():
+    config = TensorizerConfig(tensorizer_uri="vllm", vllm_tensorized=True)
+    return config
+
+
+@patch('vllm.model_executor.tensorizer_loader.TensorizerAgent')
+def test_load_with_tensorizer(mock_agent, tensorizer_config):
+    mock_linear_method = MagicMock()
+    mock_agent_instance = mock_agent.return_value
+    mock_agent_instance.deserialize.return_value = MagicMock()
+
+    result = load_with_tensorizer(tensorizer_config,
+                                  linear_method=mock_linear_method)
+
+    mock_agent.assert_called_once_with(tensorizer_config,
+                                       linear_method=mock_linear_method)
+    mock_agent_instance.deserialize.assert_called_once()
+    assert result == mock_agent_instance.deserialize.return_value
+
+
+def test_is_vllm_model_with_vllm_in_uri(tensorizer_config):
+    tensorizer_config.vllm_tensorized = True
+
+    result = is_vllm_serialized_tensorizer(tensorizer_config)
+
+    assert result is True
+
+
+def test_is_vllm_model_without_vllm_in_uri(tensorizer_config):
+    tensorizer_config.vllm_tensorized = False
+
+    result = is_vllm_serialized_tensorizer(tensorizer_config)
+
+    assert result is False
+
+
+def test_deserialized_vllm_model_has_same_outputs(vllm_runner, tmp_path):
+    vllm_model = vllm_runner(model_ref)
+    model_path = tmp_path / (model_ref + ".tensors")
+    outputs = vllm_model.generate(prompts, sampling_params)
+    model = (vllm_model.model.llm_engine.model_executor.driver_worker.
+             model_runner.model)
+    with open_stream(model_path, "wb+") as stream:
+        serializer = TensorSerializer(stream)
+        serializer.write_module(model)
+    del vllm_model, model
+    gc.collect()
+    torch.cuda.empty_cache()
+    loaded_vllm_model = vllm_runner(model_ref,
+                                    load_format="tensorizer",
+                                    tensorizer_uri=model_path,
+                                    num_readers=1,
+                                    vllm_tensorized=True)
+    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
+
+    # Assumes SamplingParams being seeded ensures the outputs are deterministic
+    assert outputs == deserialized_outputs
+
+
+@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
+def test_can_deserialize_s3(vllm_runner):
+    model_ref = "EleutherAI/pythia-1.4b"
+    tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
+
+    loaded_hf_model = vllm_runner(
+        model_ref,
+        tensorizer_uri=tensorized_path,
+        load_format="tensorizer",
+        num_readers=1,
+        vllm_tensorized=False,
+        s3_endpoint="object.ord1.coreweave.com",
+    )
+
+    deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params)
+
+    assert deserialized_outputs
+
+
+@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
+def test_deserialized_encrypted_vllm_model_has_same_outputs(
+        vllm_runner, tmp_path):
+    vllm_model = vllm_runner(model_ref)
+    model_path = tmp_path / (model_ref + ".tensors")
+    key_path = tmp_path / (model_ref + ".key")
+    outputs = vllm_model.generate(prompts, sampling_params)
+    model = (vllm_model.model.llm_engine.model_executor.driver_worker.
+             model_runner.model)
+
+    encryption_params = EncryptionParams.random()
+    with open_stream(model_path, "wb+") as stream:
+        serializer = TensorSerializer(stream, encryption=encryption_params)
+        serializer.write_module(model)
+    with open_stream(key_path, "wb+") as stream:
+        stream.write(encryption_params.key)
+    del vllm_model, model
+    gc.collect()
+    torch.cuda.empty_cache()
+    loaded_vllm_model = vllm_runner(model_ref,
+                                    tensorizer_uri=model_path,
+                                    load_format="tensorizer",
+                                    encryption_keyfile=key_path,
+                                    num_readers=1,
+                                    vllm_tensorized=True)
+
+    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
+
+    # Assumes SamplingParams being seeded ensures the outputs are deterministic
+    assert outputs == deserialized_outputs
+
+
+def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
+                                                tmp_path):
+    hf_model = hf_runner(model_ref)
+    model_path = tmp_path / (model_ref + ".tensors")
+    max_tokens = 50
+    outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens)
+    with open_stream(model_path, "wb+") as stream:
+        serializer = TensorSerializer(stream)
+        serializer.write_module(hf_model.model)
+    del hf_model
+    gc.collect()
+    torch.cuda.empty_cache()
+    loaded_hf_model = vllm_runner(model_ref,
+                                  tensorizer_uri=model_path,
+                                  load_format="tensorizer",
+                                  num_readers=1,
+                                  vllm_tensorized=False)
+
+    deserialized_outputs = loaded_hf_model.generate_greedy(
+        prompts, max_tokens=max_tokens)
+
+    assert outputs == deserialized_outputs
+
+
+def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
+    from huggingface_hub import snapshot_download
+
+    from examples.multilora_inference import (create_test_prompts,
+                                              process_requests)
+
+    model_ref = "meta-llama/Llama-2-7b-hf"
+    lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
+    test_prompts = create_test_prompts(lora_path)
+
+    # Serialize model before deserializing and binding LoRA adapters
+    vllm_model = vllm_runner(model_ref, )
+    model_path = tmp_path / (model_ref + ".tensors")
+    model = (vllm_model.model.llm_engine.model_executor.driver_worker.
+             model_runner.model)
+    with open_stream(model_path, "wb+") as stream:
+        serializer = TensorSerializer(stream)
+        serializer.write_module(model)
+    del vllm_model, model
+    gc.collect()
+    torch.cuda.empty_cache()
+    loaded_vllm_model = vllm_runner(
+        model_ref,
+        tensorizer_uri=model_path,
+        load_format="tensorizer",
+        num_readers=1,
+        vllm_tensorized=True,
+        enable_lora=True,
+        max_loras=1,
+        max_lora_rank=8,
+        max_cpu_loras=2,
+        max_num_seqs=50,
+        max_model_len=1000,
+    )
+    process_requests(loaded_vllm_model.model.llm_engine, test_prompts)
+
+    assert loaded_vllm_model
+
+
+def test_load_without_tensorizer_load_format(vllm_runner):
+    with pytest.raises(ValueError):
+        vllm_runner(model_ref, tensorizer_uri="test")
+
+
+@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
+def test_tensorize_vllm_model(tmp_path):
+    # Test serialize command
+    serialize_args = [
+        "python3", "tensorizer/tensorize_vllm_model_for_testing.py", "--model",
+        model_ref, "--dtype", "float16", "serialize", "--serialized-directory",
+        tmp_path, "--suffix", "tests"
+    ]
+    result = subprocess.run(serialize_args, capture_output=True, text=True)
+    print(result.stdout)  # Print the output of the serialize command
+
+    assert result.returncode == 0, (f"Serialize command failed with output:"
+                                    f"\n{result.stdout}\n{result.stderr}")
+
+    path_to_tensors = f"{tmp_path}/vllm/{model_ref}/tests/model.tensors"
+
+    # Test deserialize command
+    deserialize_args = [
+        "python3", "tensorizer/tensorize_vllm_model_for_testing.py", "--model",
+        model_ref, "--dtype", "float16", "deserialize", "--path-to-tensors",
+        path_to_tensors
+    ]
+    result = subprocess.run(deserialize_args, capture_output=True, text=True)
+    assert result.returncode == 0, (f"Deserialize command failed with output:"
+                                    f"\n{result.stdout}\n{result.stderr}")
+
+
+@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
+def test_openai_apiserver_with_tensorizer(tmp_path):
+    ## Serialize model
+    serialize_args = [
+        "python3", "tensorizer/tensorize_vllm_model_for_testing.py", "--model",
+        model_ref, "--dtype", "float16", "serialize", "--serialized-directory",
+        tmp_path, "--suffix", "tests"
+    ]
+    result = subprocess.run(serialize_args, capture_output=True, text=True)
+    print(result.stdout)  # Print the output of the serialize command
+
+    assert result.returncode == 0, (f"Serialize command failed with output:"
+                                    f"\n{result.stdout}\n{result.stderr}")
+
+    path_to_tensors = f"{tmp_path}/vllm/{model_ref}/tests/model.tensors"
+
+    ## Start OpenAI API server
+    openai_args = [
+        "--model", model_ref, "--dtype", "float16", "--load-format",
+        "tensorizer", "--tensorizer-uri", path_to_tensors, "--vllm-tensorized",
+        "--port", "8000"
+    ]
+
+    server = ServerRunner.remote(openai_args)
+
+    print("Server ready.")
+    assert server.ready.remote()
+
+
+def test_raise_value_error_on_invalid_load_format(vllm_runner):
+    with pytest.raises(ValueError):
+        vllm_runner(model_ref,
+                    load_format="safetensors",
+                    tensorizer_uri="test")
+
+
+def test_tensorizer_with_tp(vllm_runner):
+    with pytest.raises(ValueError):
+        model_ref = "EleutherAI/pythia-1.4b"
+        tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
+
+        vllm_runner(
+            model_ref,
+            tensorizer_uri=tensorized_path,
+            load_format="tensorizer",
+            num_readers=1,
+            vllm_tensorized=False,
+            s3_endpoint="object.ord1.coreweave.com",
+            tensor_parallel_size=2,
+        )
+
+
+@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
+def test_tensorizer_warn_quant(tmp_path):
+    model_ref = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
+    serialize_args = [
+        "python3", "tensorizer/tensorize_vllm_model_for_testing.py", "--model",
+        model_ref, "--quantization", "gptq", "--tensorizer-uri", "test",
+        "serialize", "--serialized-directory", tmp_path, "--suffix", "tests"
+    ]
+    result = subprocess.run(serialize_args, capture_output=True, text=True)
+    assert 'PerformanceWarning' in result.stderr
diff --git a/vllm/config.py b/vllm/config.py
index ca3f004d0dd4c..a626933713708 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1,6 +1,8 @@
 import enum
+import io
 import json
 import os
+import typing
 from dataclasses import dataclass, fields
 from typing import TYPE_CHECKING, ClassVar, List, Optional, Union
 
@@ -16,6 +18,8 @@
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
 
+    from vllm.model_executor.tensorizer_loader import TensorizerArgs
+
 logger = init_logger(__name__)
 
 _GB = 1 << 30
@@ -145,13 +149,14 @@ def __init__(
     def _verify_load_format(self) -> None:
         load_format = self.load_format.lower()
         supported_load_format = [
-            "auto", "pt", "safetensors", "npcache", "dummy"
+            "auto", "pt", "safetensors", "npcache", "dummy", "tensorizer"
         ]
         rocm_not_supported_load_format: List[str] = []
         if load_format not in supported_load_format:
             raise ValueError(
                 f"Unknown load format: {self.load_format}. Must be one of "
-                "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'.")
+                "'auto', 'pt', 'safetensors', 'npcache', 'tensorizer', or "
+                "'dummy'.")
         if is_hip() and load_format in rocm_not_supported_load_format:
             rocm_supported_load_format = [
                 f for f in supported_load_format
@@ -922,6 +927,65 @@ def get_image_input_enum_type(
                              f"{[x.name for x in cls.ImageInputType]}.") from e
 
 
+@dataclass
+class TensorizerConfig:
+    tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, typing.BinaryIO,
+                          str, bytes, os.PathLike, int]
+    vllm_tensorized: bool
+    verify_hash: Optional[bool] = False
+    num_readers: Optional[int] = 1
+    encryption_keyfile: Optional[str] = None
+    s3_access_key_id: Optional[str] = None
+    s3_secret_access_key: Optional[str] = None
+    s3_endpoint: Optional[str] = None
+    model_class: Optional[torch.nn.Module] = None
+    hf_config: Optional[PretrainedConfig] = None
+    dtype: Union[str, torch.dtype] = None
+
+    def _construct_tensorizer_args(self) -> "TensorizerArgs":
+        from vllm.model_executor.tensorizer_loader import TensorizerArgs
+        tensorizer_args = {
+            "tensorizer_uri": self.tensorizer_uri,
+            "vllm_tensorized": self.vllm_tensorized,
+            "verify_hash": self.verify_hash,
+            "num_readers": self.num_readers,
+            "encryption_keyfile": self.encryption_keyfile,
+            "s3_access_key_id": self.s3_access_key_id,
+            "s3_secret_access_key": self.s3_secret_access_key,
+            "s3_endpoint": self.s3_endpoint,
+        }
+        return TensorizerArgs(**tensorizer_args)
+
+    def verify_with_parallel_config(
+        self,
+        parallel_config: "ParallelConfig",
+    ) -> None:
+        if (parallel_config.tensor_parallel_size > 1
+                and self.tensorizer_uri is not None):
+            raise ValueError(
+                "Loading to multiple GPUs is not currently supported with "
+                "vLLM-serialized models. Please set tensor_parallel_size=1."
+                " or use a non-vLLM-serialized model, such as a "
+                "serialized Hugging Face `PretrainedModel`.")
+
+    def verify_with_model_config(self, model_config) -> None:
+        if (model_config.quantization is not None
+                and self.tensorizer_uri is not None):
+            from vllm.model_executor.tensorizer_loader import (
+                tensorizer_warning)
+            tensorizer_warning(
+                "Loading a model using Tensorizer with quantization on vLLM"
+                " is unstable and may lead to errors.")
+
+        if (model_config.load_format != "tensorizer"
+                and self.tensorizer_uri is not None):
+            raise ValueError(
+                "A tensorizer uri was passed for tensorizer loading, but the "
+                f"load format was set to {model_config.load_format}. "
+                "Please set the load format to 'tensorizer' to use "
+                f"tensorizer args.")
+
+
 _STR_DTYPE_TO_TORCH_DTYPE = {
     "half": torch.float16,
     "float16": torch.float16,
@@ -1069,6 +1133,7 @@ class EngineConfig:
     lora_config: Optional[LoRAConfig]
     vision_language_config: Optional[VisionLanguageConfig]
     speculative_config: Optional[SpeculativeConfig]
+    tensorizer_config: Optional[TensorizerConfig]
 
     def __post_init__(self):
         """Verify configs are valid & consistent with each other.
@@ -1076,6 +1141,11 @@ def __post_init__(self):
         self.model_config.verify_with_parallel_config(self.parallel_config)
         self.cache_config.verify_with_parallel_config(self.parallel_config)
 
+        if self.tensorizer_config:
+            self.tensorizer_config.verify_with_parallel_config(
+                self.parallel_config)
+            self.tensorizer_config.verify_with_model_config(self.model_config)
+
         if self.lora_config:
             self.lora_config.verify_with_model_config(self.model_config)
             self.lora_config.verify_with_scheduler_config(
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 12ef26cf83180..90453adabf379 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -2,13 +2,16 @@
 
 import argparse
 import dataclasses
+import io
+import os
 from dataclasses import dataclass
-from typing import Optional
+from typing import BinaryIO, Optional, Union
 
 from vllm.config import (CacheConfig, DeviceConfig, EngineConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
-                         SpeculativeConfig, TokenizerPoolConfig,
-                         VisionLanguageConfig)
+                         SpeculativeConfig, TensorizerConfig,
+                         TokenizerPoolConfig, VisionLanguageConfig)
+from vllm.model_executor.tensorizer_loader import TensorizerArgs
 from vllm.utils import str_to_int_tuple
 
 
@@ -62,12 +65,22 @@ class EngineArgs:
     num_gpu_blocks_override: Optional[int] = None
     num_lookahead_slots: int = 0
 
+    # Tensorizer configuration parameters
+    tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, BinaryIO, str,
+                          bytes, os.PathLike, int] = None
+    vllm_tensorized: bool = False
+    verify_hash: Optional[bool] = False
+    num_readers: Optional[int] = 1
+    encryption_keyfile: Optional[str] = None
+    s3_access_key_id: Optional[str] = None
+    s3_secret_access_key: Optional[str] = None
+    s3_endpoint: Optional[str] = None
+
     # Related to Vision-language models such as llava
     image_input_type: Optional[str] = None
     image_token_id: Optional[int] = None
     image_input_shape: Optional[str] = None
     image_feature_size: Optional[int] = None
-
     scheduler_delay_factor: float = 0.0
     enable_chunked_prefill: bool = False
 
@@ -139,7 +152,9 @@ def add_cli_args(
             '--load-format',
             type=str,
             default=EngineArgs.load_format,
-            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
+            choices=[
+                'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer'
+            ],
             help='The format of the model weights to load. '
             '"auto" will try to load the weights in the safetensors format '
             'and fall back to the pytorch bin format if safetensors format '
@@ -149,7 +164,10 @@ def add_cli_args(
             '"npcache" will load the weights in pytorch format and store '
             'a numpy cache to speed up the loading. '
             '"dummy" will initialize the weights with random values, '
-            'which is mainly for profiling.')
+            'which is mainly for profiling.'
+            '"tensorizer" will load the weights using tensorizer from CoreWeave'
+            'which assumes tensorizer_uri is set to the location of the '
+            'serialized weights.')
         parser.add_argument(
             '--dtype',
             type=str,
@@ -418,6 +436,7 @@ def add_cli_args(
             default=None,
             help='The number of speculative tokens to sample from '
             'the draft model in speculative decoding')
+        parser = TensorizerArgs.add_cli_args(parser)
         return parser
 
     @classmethod
@@ -493,6 +512,17 @@ def create_engine_config(self, ) -> EngineConfig:
             max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
             and self.max_cpu_loras > 0 else None) if self.enable_lora else None
 
+        tensorizer_config = TensorizerConfig(
+            tensorizer_uri=self.tensorizer_uri,
+            vllm_tensorized=self.vllm_tensorized,
+            verify_hash=self.verify_hash,
+            num_readers=self.num_readers,
+            encryption_keyfile=self.encryption_keyfile,
+            s3_access_key_id=self.s3_access_key_id,
+            s3_secret_access_key=self.s3_secret_access_key,
+            s3_endpoint=self.s3_endpoint,
+        )
+
         if self.image_input_type:
             if (not self.image_token_id or not self.image_input_shape
                     or not self.image_feature_size):
@@ -516,7 +546,8 @@ def create_engine_config(self, ) -> EngineConfig:
                             device_config=device_config,
                             lora_config=lora_config,
                             vision_language_config=vision_language_config,
-                            speculative_config=speculative_config)
+                            speculative_config=speculative_config,
+                            tensorizer_config=tensorizer_config)
 
 
 @dataclass
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 15d7eabbf9de6..c02234f7fe3f5 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -6,7 +6,7 @@
 import vllm
 from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
-                         VisionLanguageConfig)
+                         TensorizerConfig, VisionLanguageConfig)
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.metrics import StatLogger, Stats
@@ -74,6 +74,7 @@ def __init__(
         lora_config: Optional[LoRAConfig],
         vision_language_config: Optional[VisionLanguageConfig],
         speculative_config: Optional[SpeculativeConfig],
+        tensorizer_config: Optional[TensorizerConfig],
         executor_class: Type[ExecutorBase],
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
@@ -112,6 +113,7 @@ def __init__(
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.speculative_config = speculative_config
+        self.tensorizer_config = tensorizer_config
         self.log_stats = log_stats
 
         self._init_tokenizer()
@@ -127,6 +129,7 @@ def __init__(
             lora_config=lora_config,
             vision_language_config=vision_language_config,
             speculative_config=speculative_config,
+            tensorizer_config=tensorizer_config,
         )
 
         self._initialize_kv_caches()
@@ -266,6 +269,9 @@ def _init_tokenizer(self, **tokenizer_init_kwargs):
     def _verify_args(self) -> None:
         self.model_config.verify_with_parallel_config(self.parallel_config)
         self.cache_config.verify_with_parallel_config(self.parallel_config)
+        if self.tensorizer_config:
+            self.tensorizer_config.verify_with_parallel_config(
+                self.parallel_config)
         if self.lora_config:
             self.lora_config.verify_with_model_config(self.model_config)
             self.lora_config.verify_with_scheduler_config(
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index f20221a0b941a..30577ecf62faa 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -2,7 +2,7 @@
 
 from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
-                         VisionLanguageConfig)
+                         TensorizerConfig, VisionLanguageConfig)
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -15,17 +15,14 @@
 
 class GPUExecutor(ExecutorBase):
 
-    def __init__(
-        self,
-        model_config: ModelConfig,
-        cache_config: CacheConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
-        speculative_config: Optional[SpeculativeConfig],
-    ) -> None:
+    def __init__(self, model_config: ModelConfig, cache_config: CacheConfig,
+                 parallel_config: ParallelConfig,
+                 scheduler_config: SchedulerConfig,
+                 device_config: DeviceConfig,
+                 lora_config: Optional[LoRAConfig],
+                 vision_language_config: Optional[VisionLanguageConfig],
+                 speculative_config: Optional[SpeculativeConfig],
+                 tensorizer_config: Optional[TensorizerConfig]) -> None:
         self.model_config = model_config
         self.cache_config = cache_config
         self.lora_config = lora_config
@@ -33,6 +30,7 @@ def __init__(
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.vision_language_config = vision_language_config
+        self.tensorizer_config = tensorizer_config
 
         assert (not speculative_config
                 ), "Speculative decoding not yet supported for GPU backend"
@@ -61,6 +59,7 @@ def _init_worker(self):
             distributed_init_method=distributed_init_method,
             lora_config=self.lora_config,
             vision_language_config=self.vision_language_config,
+            tensorizer_config=self.tensorizer_config,
             is_driver_worker=True,
         )
         self.driver_worker.init_device()
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index b937693c92257..28dc3e0db312a 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -7,7 +7,7 @@
 
 from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
-                         VisionLanguageConfig)
+                         TensorizerConfig, VisionLanguageConfig)
 from vllm.engine.ray_utils import RayWorkerVllm, ray
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
@@ -42,6 +42,7 @@ def __init__(
         lora_config: Optional[LoRAConfig],
         vision_language_config: Optional[VisionLanguageConfig],
         speculative_config: Optional[SpeculativeConfig],
+        tensorizer_config: Optional[TensorizerConfig],
     ) -> None:
         self.model_config = model_config
         self.cache_config = cache_config
@@ -50,6 +51,7 @@ def __init__(
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.vision_language_config = vision_language_config
+        self.tensorizer_config = tensorizer_config
         assert (not speculative_config
                 ), "Speculative decoding not yet supported for RayGPU backend."
 
@@ -171,6 +173,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
                     distributed_init_method=distributed_init_method,
                     lora_config=lora_config,
                     vision_language_config=vision_language_config,
+                    tensorizer_config=self.tensorizer_config,
                 ))
 
         # Initialize the driver worker with the Worker class.
@@ -187,6 +190,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
             distributed_init_method=distributed_init_method,
             lora_config=self.lora_config,
             vision_language_config=self.vision_language_config,
+            tensorizer_config=self.tensorizer_config,
             is_driver_worker=True,
         )
 
diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py
index b95a37a6c1c63..016689cefe850 100644
--- a/vllm/model_executor/model_loader.py
+++ b/vllm/model_executor/model_loader.py
@@ -3,11 +3,14 @@
 from typing import Tuple, Type
 
 import torch
-import torch.nn as nn
+from torch import nn
 
 from vllm.config import DeviceConfig, ModelConfig
 from vllm.model_executor.models import ModelRegistry
 from vllm.model_executor.models.llava import LlavaForConditionalGeneration
+from vllm.model_executor.tensorizer_loader import (
+    ParameterizedLoadFormat, is_vllm_serialized_tensorizer,
+    load_with_tensorizer)
 from vllm.model_executor.weight_utils import (get_quant_config,
                                               get_sparse_config,
                                               initialize_dummy_weights)
@@ -52,6 +55,7 @@ def get_model(model_config: ModelConfig, device_config: DeviceConfig,
               **kwargs) -> nn.Module:
     lora_config = kwargs.get("lora_config", None)
     vision_language_config = kwargs.get("vision_language_config", None)
+    tensorizer_config = kwargs.get("tensorizer_config", None)
     model_class = _get_model_architecture(model_config)[0]
 
     # Get the (maybe sparse or quantized) linear method.
@@ -72,6 +76,7 @@ def get_model(model_config: ModelConfig, device_config: DeviceConfig,
                 f"{model_config.dtype} is not supported for quantization "
                 f"method {model_config.quantization}. Supported dtypes: "
                 f"{supported_dtypes}")
+
         linear_method = quant_config.get_linear_method()
     # UPSTREAM SYNC: needed to support sparsity
     if model_config.sparsity is not None:
@@ -95,28 +100,48 @@ def get_model(model_config: ModelConfig, device_config: DeviceConfig,
     with _set_default_torch_dtype(model_config.dtype):
         # Create a model instance.
         # The weights will be initialized as empty tensors.
+        extra_kwargs = {}
+        if hasattr(model_class, "supported_lora_modules"):
+            extra_kwargs["lora_config"] = lora_config
+        elif lora_config:
+            raise ValueError(
+                f"Model {model_class.__name__} does not support LoRA, "
+                "but LoRA is enabled. Support for this model may "
+                "be added in the future. If this is important to you, "
+                "please open an issue on github.")
+        elif model_class in _VISION_MODEL_CLASSES:
+            extra_kwargs["vision_language_config"] = vision_language_config
+
         with torch.device(device_config.device):
-            if hasattr(model_class, "supported_lora_modules"):
-                model = model_class(model_config.hf_config, linear_method,
-                                    lora_config)
-            elif lora_config:
-                raise ValueError(
-                    f"Model {model_class.__name__} does not support LoRA, "
-                    "but LoRA is enabled. Support for this model may "
-                    "be added in the future. If this is important to you, "
-                    "please open an issue on github.")
-            else:
-                if model_class not in _VISION_MODEL_CLASSES:
-                    model = model_class(model_config.hf_config, linear_method)
-                else:
-                    model = model_class(model_config.hf_config,
-                                        vision_language_config, linear_method)
+            if (model_config.load_format == "tensorizer"
+                    and is_vllm_serialized_tensorizer(tensorizer_config)):
+                extra_kwargs["linear_method"] = linear_method
+                tensorizer_config.model_class = model_class
+                tensorizer_config.hf_config = model_config.hf_config
+                tensorizer_config.dtype = model_config.dtype
+                model = load_with_tensorizer(tensorizer_config, **extra_kwargs)
+                return model.eval()
+            model = model_class(config=model_config.hf_config,
+                                linear_method=linear_method,
+                                **extra_kwargs)
         if model_config.load_format == "dummy":
             # NOTE(woosuk): For accurate performance evaluation, we assign
             # random values to the weights.
             initialize_dummy_weights(model)
         else:
             # Load the weights from the cached or downloaded files.
-            model.load_weights(model_config.model, model_config.download_dir,
-                               model_config.load_format, model_config.revision)
+            if model_config.load_format == "tensorizer":
+                # Provide a dynamic load format for `model.load_weights`
+                # to retain tensorizer args from CLI.
+                model_config.load_format = ParameterizedLoadFormat(
+                    model_config.load_format)
+                model_config.load_format.params = (
+                    tensorizer_config._construct_tensorizer_args())
+
+            model.load_weights(
+                model_config.model,
+                model_config.download_dir,
+                model_config.load_format,
+                model_config.revision,
+            )
     return model.eval()
diff --git a/vllm/model_executor/tensorizer_loader.py b/vllm/model_executor/tensorizer_loader.py
new file mode 100644
index 0000000000000..ed3ad9e2ffa15
--- /dev/null
+++ b/vllm/model_executor/tensorizer_loader.py
@@ -0,0 +1,319 @@
+import argparse
+import dataclasses
+import io
+import os
+import time
+import typing
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from vllm.config import TensorizerConfig
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import LinearMethodBase
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding)
+
+tensorizer_load_fail = False
+
+try:
+    from tensorizer import (DecryptionParams, EncryptionParams,
+                            TensorDeserializer, TensorSerializer)
+    from tensorizer.stream_io import open_stream
+    from tensorizer.utils import (convert_bytes, get_mem_usage,
+                                  no_init_or_tensor)
+except ImportError:
+    tensorizer_load_fail = True
+
+__all__ = [
+    'EncryptionParams', 'DecryptionParams', 'TensorDeserializer',
+    'TensorSerializer', 'open_stream', 'convert_bytes', 'get_mem_usage',
+    'no_init_or_tensor'
+]
+
+logger = init_logger(__name__)
+
+
+def load_with_tensorizer(tensorizer_config: TensorizerConfig,
+                         **extra_kwargs) -> nn.Module:
+    tensorizer = TensorizerAgent(tensorizer_config, **extra_kwargs)
+    return tensorizer.deserialize()
+
+
+def tensorizer_warning(message: str):
+    return warnings.warn(message, category=PerformanceWarning, stacklevel=2)
+
+
+def is_vllm_serialized_tensorizer(tensorizer_config: TensorizerConfig) -> bool:
+    if tensorizer_config is None:
+        return False
+    return tensorizer_config.vllm_tensorized
+
+
+class ParameterizedLoadFormat(str):
+    __slots__ = "params"
+
+
+class PerformanceWarning(UserWarning):
+
+    def __str__(self):
+        return (f"{super().__str__()}"
+                " (set the VLLM_SILENCE_PERFORMANCE_WARNINGS"
+                " environment variable to hide this)")
+
+
+if (os.getenv("VLLM_SILENCE_PERFORMANCE_WARNINGS", "").lower()
+        not in ("", "0", "n", "no", "off", "disable")):
+    warnings.simplefilter("ignore", category=PerformanceWarning)
+
+
+@dataclass
+class TensorizerArgs:
+    tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, typing.BinaryIO,
+                          str, bytes, os.PathLike, int]
+    vllm_tensorized: bool
+    verify_hash: Optional[bool] = False
+    num_readers: Optional[int] = 1
+    encryption_keyfile: Optional[str] = None
+    s3_access_key_id: Optional[str] = None
+    s3_secret_access_key: Optional[str] = None
+    s3_endpoint: Optional[str] = None
+    """
+  Args for the TensorizerAgent class. These are used to configure the behavior 
+  of the TensorDeserializer when loading tensors from a serialized model.
+  
+  Args:
+      tensorizer_uri: Path to serialized model tensors. Can be a local file 
+          path or a S3 URI.
+      vllm_tensorized: If True, indicates that the serialized model is a 
+          vLLM model. This is used to determine the behavior of the 
+          TensorDeserializer when loading tensors from a serialized model.
+          It is far faster to deserialize a vLLM model as it utilizes
+          tensorizer's optimized GPU loading.
+      verify_hash: If True, the hashes of each tensor will be verified against 
+          the hashes stored in the metadata. A `HashMismatchError` will be 
+          raised if any of the hashes do not match.
+      num_readers: Controls how many threads are allowed to read concurrently
+          from the source file. Default is 1. This greatly increases
+          performance.
+      encryption_keyfile: File path to a binary file containing a  
+          binary key to use for decryption. `None` (the default) means 
+          no decryption. See the example script in 
+          examples/tensorize_vllm_model.py. 
+      s3_access_key_id: The access key for the S3 bucket. Can also be set via
+          the S3_ACCESS_KEY_ID environment variable.
+      s3_secret_access_key: The secret access key for the S3 bucket. Can also
+          be set via the S3_SECRET_ACCESS_KEY environment variable.
+      s3_endpoint: The endpoint for the S3 bucket. Can also be set via the
+          S3_ENDPOINT_URL environment variable.
+  """
+
+    def __post_init__(self):
+        self.file_obj = self.tensorizer_uri
+        self.s3_access_key_id = (self.s3_access_key_id
+                                 or os.environ.get("S3_ACCESS_KEY_ID")) or None
+        self.s3_secret_access_key = (
+            self.s3_secret_access_key
+            or os.environ.get("S3_SECRET_ACCESS_KEY")) or None
+        self.s3_endpoint = (self.s3_endpoint
+                            or os.environ.get("S3_ENDPOINT_URL")) or None
+        self.stream_params = {
+            "s3_access_key_id": self.s3_access_key_id,
+            "s3_secret_access_key": self.s3_secret_access_key,
+            "s3_endpoint": self.s3_endpoint,
+        }
+
+        # Omitting self.dtype and self.device as this behaves weirdly
+        self.deserializer_params = {
+            "verify_hash": self.verify_hash,
+            "encryption": self.encryption_keyfile,
+            "num_readers": self.num_readers
+        }
+        if self.encryption_keyfile:
+            with open_stream(
+                    self.encryption_keyfile,
+                    **self.stream_params,
+            ) as stream:
+                key = stream.read()
+                decryption_params = DecryptionParams.from_key(key)
+                self.deserializer_params['encryption'] = decryption_params
+
+    def add_cli_args(
+            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+        """Tensorizer CLI arguments"""
+
+        # Create the argument group
+        group = parser.add_argument_group(
+            'tensorizer options',
+            description=('Options for configuring the behavior of the'
+                         ' tensorizer deserializer when '
+                         '--load-format=tensorizer'))
+
+        group.add_argument(
+            "--tensorizer-uri",
+            help="Path to serialized model tensors. Can be a local file path,"
+            " or an HTTP(S) or S3 URI.",
+        )
+        group.add_argument(
+            "--verify-hash",
+            action="store_true",
+            help="If enabled, the hashes of each tensor will be verified"
+            " against the hashes stored in the file metadata. An exception"
+            " will be raised if any of the hashes do not match.",
+        )
+        group.add_argument(
+            "--encryption-keyfile",
+            default=None,
+            help="The file path to a binary file containing a binary key to "
+            "use for decryption. Can be a file path or S3 network URI.")
+        group.add_argument(
+            "--num-readers",
+            default=1,
+            type=int,
+            help="Controls how many threads are allowed to read concurrently "
+            "from the source file.")
+        group.add_argument(
+            "--s3-access-key-id",
+            default=None,
+            help="The access key for the S3 bucket. Can also be set via the "
+            "S3_ACCESS_KEY_ID environment variable.",
+        )
+        group.add_argument(
+            "--s3-secret-access-key",
+            default=None,
+            help="The secret access key for the S3 bucket. Can also be set via "
+            "the S3_SECRET_ACCESS_KEY environment variable.",
+        )
+        group.add_argument(
+            "--s3-endpoint",
+            default=None,
+            help="The endpoint for the S3 bucket. Can also be set via the "
+            "S3_ENDPOINT_URL environment variable.",
+        )
+        group.add_argument(
+            "--vllm-tensorized",
+            action="store_true",
+            help="If enabled, indicates that the serialized model is a vLLM "
+            "model. This is used to determine the behavior of the "
+            "TensorDeserializer when loading tensors from a "
+            "serialized model.")
+
+        return parser
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace) -> "TensorizerArgs":
+        # Get the list of attributes of this dataclass.
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        # Set the attributes from the parsed arguments.
+        tensorizer_args = cls(**{
+            attr: getattr(args, attr)
+            for attr in attrs if hasattr(args, attr)
+        })
+        return tensorizer_args
+
+
+class TensorizerAgent:
+    """
+    A class for performing tensorizer deserializations specifically for
+    vLLM models using plaid_mode. Uses TensorizerArgs to configure the
+    behavior of the TensorDeserializer when loading tensors from a serialized
+    model. For deserializations of HuggingFace models, TensorDeserializer is
+    instead used as an iterator directly in the func hf_model_weights_iterator
+    in vllm/model_executor/weight_utils.py
+    """
+
+    def __init__(self, tensorizer_config: TensorizerConfig,
+                 linear_method: LinearMethodBase, **extra_kwargs):
+        self.tensorizer_config = tensorizer_config
+        self.tensorizer_args = (
+            self.tensorizer_config._construct_tensorizer_args())
+        self.extra_kwargs = extra_kwargs
+        if extra_kwargs.get("linear_method", None) is not None:
+            self.linear_method = extra_kwargs["linear_method"]
+        else:
+            self.linear_method = linear_method
+        self.model = self._init_model()
+
+        if tensorizer_load_fail:
+            raise ImportError(
+                "Tensorizer is not installed. Please install tensorizer "
+                "to use this feature with `pip install vllm[tensorizer]`.")
+
+    def _init_model(self):
+        model_args = self.tensorizer_config.hf_config
+        model_args.torch_dtype = self.tensorizer_config.dtype
+        with no_init_or_tensor():
+            return self.tensorizer_config.model_class(
+                config=model_args,
+                linear_method=self.linear_method,
+                **self.extra_kwargs)
+
+    def _resize_lora_embeddings(self):
+        """Modify LoRA embedding layers to use bigger tensors
+        to allow for adapter added tokens."""
+        for child in self.model.modules():
+            if (isinstance(child, VocabParallelEmbedding)
+                    and child.weight.shape[0] <
+                    child.num_embeddings_per_partition):
+                new_weight = torch.empty(child.num_embeddings_per_partition,
+                                         child.embedding_dim,
+                                         dtype=child.weight.dtype,
+                                         device=child.weight.device)
+                new_weight[:child.weight.shape[0]].copy_(child.weight.data)
+                new_weight[child.weight.shape[0]:].fill_(0)
+                child.weight.data = new_weight
+
+    def _check_tensors_on_meta_device(self):
+        for tensor in self.model.state_dict().values():
+            if tensor.device.type == 'meta':
+                raise ValueError(
+                    "The serialized model contains tensors on the meta device,"
+                    " indicating that some tensors were not loaded properly."
+                    " Please check that the parameters of the model being"
+                    " specified match that of the serialized model, such as"
+                    " its quantization.")
+
+    def deserialize(self):
+        """
+        Deserialize the model using the TensorDeserializer. This method is
+        specifically for vLLM models using tensorizer's plaid_mode.
+
+        The deserializer makes use of tensorizer_args.stream_params
+        to configure the behavior of the stream when loading tensors from a
+        serialized model. The deserializer_params are used to configure the
+        behavior of the TensorDeserializer when loading tensors themselves.
+        Documentation on these params can be found in TensorizerArgs
+
+        Returns:
+            nn.Module: The deserialized model.
+        """
+        before_mem = get_mem_usage()
+        # Lazy load the tensors from S3 into the model.
+        start = time.perf_counter()
+        with open_stream(
+                self.tensorizer_args.tensorizer_uri,
+                mode="rb",
+                **self.tensorizer_args.stream_params,
+        ) as stream, TensorDeserializer(
+                stream,
+                dtype=self.tensorizer_config.dtype,
+                **self.tensorizer_args.deserializer_params) as deserializer:
+            deserializer.load_into_module(self.model)
+            end = time.perf_counter()
+
+        total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
+        duration = end - start
+        per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
+        after_mem = get_mem_usage()
+        deserializer.close()
+        logger.info(f"Deserialized {total_bytes_str} in "
+                    f"{end - start:0.2f}s, {per_second}/s")
+        logger.info(f"Memory usage before: {before_mem}")
+        logger.info(f"Memory usage after: {after_mem}")
+
+        self._check_tensors_on_meta_device()
+        self._resize_lora_embeddings()
+        return self.model.eval()
diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py
index c77ed29446276..6226f9032a98b 100644
--- a/vllm/model_executor/weight_utils.py
+++ b/vllm/model_executor/weight_utils.py
@@ -5,7 +5,7 @@
 import json
 import os
 from collections import defaultdict
-from typing import Any, Iterable, Iterator, List, Optional, Tuple
+from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union
 
 import filelock
 import huggingface_hub.constants
@@ -176,7 +176,8 @@ def prepare_hf_model_weights(
     revision: Optional[str] = None,
 ) -> Tuple[str, List[str], bool]:
     # Download model weights from huggingface.
-    is_local = os.path.isdir(model_name_or_path)
+    is_local = os.path.isdir(model_name_or_path) \
+               and load_format != "tensorizer"
     use_safetensors = False
     # Some quantized models use .pt files for storing the weights.
     if load_format == "auto":
@@ -188,13 +189,15 @@ def prepare_hf_model_weights(
         allow_patterns = ["*.pt"]
     elif load_format == "npcache":
         allow_patterns = ["*.bin"]
+    elif load_format == "tensorizer":
+        allow_patterns = ["*.tensors"]
     else:
         raise ValueError(f"Unknown load_format: {load_format}")
 
     if fall_back_to_pt:
         allow_patterns += ["*.pt"]
 
-    if not is_local:
+    if not is_local and load_format != "tensorizer":
         # Before we download we look at that is available:
         fs = HfFileSystem()
         file_list = fs.ls(model_name_or_path, detail=False, revision=revision)
@@ -239,6 +242,9 @@ def prepare_hf_model_weights(
             if not any(f.endswith(x) for x in blacklist)
         ]
 
+    if load_format == "tensorizer":
+        return hf_folder, hf_weights_files, use_safetensors
+
     if len(hf_weights_files) == 0:
         raise RuntimeError(
             f"Cannot find any model weights with `{model_name_or_path}`")
@@ -249,7 +255,7 @@ def prepare_hf_model_weights(
 def hf_model_weights_iterator(
     model_name_or_path: str,
     cache_dir: Optional[str] = None,
-    load_format: str = "auto",
+    load_format: Union[Tuple, str] = "auto",
     revision: Optional[str] = None,
     fall_back_to_pt: Optional[bool] = True,
 ) -> Iterator[Tuple[str, torch.Tensor]]:
@@ -292,6 +298,26 @@ def hf_model_weights_iterator(
             with open(param_path, "rb") as f:
                 param = np.load(f)
             yield name, torch.from_numpy(param)
+    elif load_format == "tensorizer":
+        from vllm.model_executor.tensorizer_loader import (TensorDeserializer,
+                                                           open_stream,
+                                                           tensorizer_warning)
+        tensorizer_args = load_format.params
+        tensorizer_warning(
+            "Deserializing HuggingFace models is not optimized for "
+            "loading on vLLM, as tensorizer is forced to load to CPU. "
+            "Consider deserializing a vLLM model instead for faster "
+            "load times. See the examples/tensorize_vllm_model.py example "
+            "script for serializing vLLM models.")
+
+        deserializer_args = tensorizer_args.deserializer_params
+        stream_params = tensorizer_args.stream_params
+        stream = open_stream(tensorizer_args.tensorizer_uri, **stream_params)
+        with TensorDeserializer(stream, **deserializer_args,
+                                device="cpu") as state:
+            for name, param in state.items():
+                yield name, param
+        del state
     elif use_safetensors:
         for st_file in hf_weights_files:
             with safe_open(st_file, framework="pt") as f:
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 0051d6a3fb309..12ef80ab60f20 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -12,7 +12,8 @@
 from vllm.attention import (AttentionMetadata, AttentionMetadataPerStage,
                             get_attn_backend)
 from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, VisionLanguageConfig)
+                         SchedulerConfig, TensorizerConfig,
+                         VisionLanguageConfig)
 from vllm.distributed import broadcast_tensor_dict, with_pynccl_for_all_reduce
 from vllm.distributed.device_communicators import (custom_all_reduce,
                                                    pynccl_utils)
@@ -113,11 +114,13 @@ def __init__(
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
         vision_language_config: Optional[VisionLanguageConfig] = None,
+        tensorizer_config: Optional[TensorizerConfig] = None,
     ):
         self.model_config = model_config
         self.parallel_config = parallel_config
         self.scheduler_config = scheduler_config
         self.lora_config = lora_config
+        self.tensorizer_config = tensorizer_config
         self.is_driver_worker = is_driver_worker
 
         # model_config can be None in tests/samplers/test_sampler.py.
@@ -160,7 +163,9 @@ def load_model(self) -> None:
                 lora_config=self.lora_config,
                 vision_language_config=self.vision_language_config,
                 parallel_config=self.parallel_config,
-                scheduler_config=self.scheduler_config)
+                scheduler_config=self.scheduler_config,
+                tensorizer_config=self.tensorizer_config,
+            )
 
         self.model_memory_usage = m.consumed_memory
         logger.info(f"Loading model weights took "
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 3f0b2fd83f3e5..82491c6df6616 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -7,7 +7,8 @@
 import torch.distributed
 
 from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, VisionLanguageConfig)
+                         ParallelConfig, SchedulerConfig, TensorizerConfig,
+                         VisionLanguageConfig)
 from vllm.distributed import (broadcast_tensor_dict,
                               ensure_model_parallel_initialized,
                               init_distributed_environment)
@@ -42,6 +43,7 @@ def __init__(
         distributed_init_method: str,
         lora_config: Optional[LoRAConfig] = None,
         vision_language_config: Optional[VisionLanguageConfig] = None,
+        tensorizer_config: Optional[TensorizerConfig] = None,
         is_driver_worker: bool = False,
     ) -> None:
         self.model_config = model_config
@@ -53,6 +55,7 @@ def __init__(
         self.rank = rank
         self.distributed_init_method = distributed_init_method
         self.lora_config = lora_config
+        self.tensorizer_config = tensorizer_config
         self.is_driver_worker = is_driver_worker
         if self.is_driver_worker:
             assert self.rank == 0, "The driver worker must have rank 0."
@@ -70,7 +73,9 @@ def __init__(
             lora_config=self.lora_config,
             kv_cache_dtype=self.cache_config.cache_dtype,
             is_driver_worker=is_driver_worker,
-            vision_language_config=vision_language_config)
+            vision_language_config=vision_language_config,
+            tensorizer_config=tensorizer_config,
+        )
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
         self.cache_engine = None

From f39e0b5ef0ece196e3f200e4d3bcb2e49ce5d639 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 13 Apr 2024 23:40:21 -0700
Subject: [PATCH 015/106] [Core] avoid too many cuda context by caching p2p
 test (#4021)

---
 .../device_communicators/custom_all_reduce.py | 53 +++++------
 vllm/distributed/parallel_state.py            |  9 ++
 vllm/distributed/utils.py                     | 87 ++++++++++++++++++-
 3 files changed, 116 insertions(+), 33 deletions(-)

diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index 84238d2e46076..f83caef879da3 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -42,12 +42,17 @@ def init_custom_ar() -> None:
             " disable_custom_all_reduce=True explicitly.", world_size,
             str(_SUPPORTED_WORLD_SIZES))
         return
-    if not _can_p2p(rank, world_size):
+    num_dev = torch.cuda.device_count()
+    # note: num dev can be larger than world_size if we're only using
+    # first few GPUs
+    if num_dev < world_size:
         logger.warn(
-            "Custom allreduce is disabled because your platform lacks GPU P2P"
-            " capability or P2P test failed. To silence this warning, specify"
-            " disable_custom_all_reduce=True explicitly.")
-        return
+            "Cannot test GPU P2P because not all GPUs are visible to the "
+            "current process. This might be the case if 'CUDA_VISIBLE_DEVICES'"
+            " is set.")
+        return False
+    # test nvlink first, this will filter out most of the cases
+    # where custom allreduce is not supported
     full_nvlink = _is_full_nvlink(rank, world_size)
     if world_size > 2 and not full_nvlink:
         logger.warn(
@@ -55,6 +60,15 @@ def init_custom_ar() -> None:
             " than two PCIe-only GPUs. To silence this warning, specify"
             " disable_custom_all_reduce=True explicitly.")
         return
+    # test P2P capability
+    # this is expensive to compute at the first time
+    # then we cache the result
+    if not _can_p2p(rank, world_size):
+        logger.warn(
+            "Custom allreduce is disabled because your platform lacks GPU P2P"
+            " capability or P2P test failed. To silence this warning, specify"
+            " disable_custom_all_reduce=True explicitly.")
+        return
     _CA_HANDLE = CustomAllreduce(rank, world_size, full_nvlink)
 
 
@@ -143,40 +157,15 @@ def _is_full_nvlink(rank, world_size):
 
 
 def _can_p2p(rank: int, world_size: int) -> bool:
-    num_dev = torch.cuda.device_count()
-    # note: num dev can be larger than world_size if we're only using
-    # first few GPUs
-    if num_dev < world_size:
-        logger.warn(
-            "Cannot test GPU P2P because not all GPUs are visible to the "
-            "current process. This might be the case if 'CUDA_VISIBLE_DEVICES'"
-            " is set.")
-        return False
+    from vllm.distributed.utils import gpu_p2p_access_check
     for i in range(world_size):
         if i == rank:
             continue
-        if not torch.cuda.can_device_access_peer(rank, i):
-            return False
-        # on some platforms, P2P support might be buggy and we need
-        # additional checks. See also:
-        # https://github.com/vllm-project/vllm/issues/2728
-        if not _can_actually_p2p(rank, i):
+        if not gpu_p2p_access_check(rank, i):
             return False
     return True
 
 
-# code partly borrowed from
-# https://github.com/turboderp/exllamav2/blob/1c67f97f3d2a968605a9c31ab791a05c85bb7879/exllamav2/compat.py#L10
-# License: MIT
-def _can_actually_p2p(idx_a, idx_b):
-    dev_i = f"cuda:{idx_a}"
-    dev_j = f"cuda:{idx_b}"
-    a = torch.randn(5, device=dev_i) + 123.0
-    b = a.to(dev_j)
-    c = b.to(dev_i)
-    return torch.all(a == c)
-
-
 class CustomAllreduce:
 
     # max_size: max supported allreduce size
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 1258bf58cb453..e2473736375e0 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -41,6 +41,13 @@
 # source rank when broadcasting from the first or last pipeline stage.
 _PIPELINE_GLOBAL_RANKS = None
 
+_LOCAL_RANK = -1
+
+
+def get_local_rank():
+    global _LOCAL_RANK
+    return _LOCAL_RANK
+
 
 def init_distributed_environment(
     world_size: int = -1,
@@ -66,6 +73,8 @@ def init_distributed_environment(
         ranks = list(range(torch.distributed.get_world_size()))
         _CPU_WORLD_GROUP = torch.distributed.new_group(ranks=ranks,
                                                        backend="gloo")
+        global _LOCAL_RANK
+        _LOCAL_RANK = local_rank
 
 
 def initialize_model_parallel(
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index 0cd420c8e11b5..e0a871ebe1756 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -2,9 +2,18 @@
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-from typing import Sequence
+import json
+import os
+from typing import Dict, Optional, Sequence
 
 import torch
+import torch.distributed as dist
+
+from vllm.logger import init_logger
+
+from .parallel_state import get_cpu_world_group, get_local_rank
+
+logger = init_logger(__name__)
 
 
 def ensure_divisibility(numerator, denominator):
@@ -46,3 +55,79 @@ def split_tensor_along_last_dim(
         return tuple(chunk.contiguous() for chunk in tensor_list)
 
     return tensor_list
+
+
+# code partly borrowed from
+# https://github.com/turboderp/exllamav2/blob/1c67f97f3d2a968605a9c31ab791a05c85bb7879/exllamav2/compat.py#L10
+# License: MIT
+def _can_actually_p2p(idx_a, idx_b):
+    dev_i = f"cuda:{idx_a}"
+    dev_j = f"cuda:{idx_b}"
+    a = torch.randn(5, device=dev_i) + 123.0
+    b = a.to(dev_j)
+    c = b.to(dev_i)
+    return torch.all(a == c).cpu().item()
+
+
+# why do we need this cache?
+# 1. we can have runtime checks for P2P access, where every process checks
+#  P2P access to all other GPUs. Unfortunately, the test might cost many
+#  (world_size * world_size) cuda context, and reduce the memory available
+#  for the model. see https://github.com/vllm-project/vllm/issues/3821
+# 2. alternatively, we can have a p2p map that is generated by the master
+#  process and broadcasted to all other processes. This still requires
+#  #world_size of cuda context, belonging to the master process, on each GPU.
+# 3. we can have a cache file, that records the p2p access status. The first
+#  time the master process checks the p2p access, it will generate the cache
+#  file, at the cost of #world_size of cuda context. Later on, all processes
+#  can read the cache file to check the p2p access status without any cost of
+#  additional cuda context.
+# Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we
+#  can have different cache files for different CUDA_VISIBLE_DEVICES settings,
+#  e.g. used by different vllm engines. The device id in the cache file is a
+#  **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
+#  of visible devices in the vllm engine.
+_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None
+
+
+def gpu_p2p_access_check(i: int, j: int) -> bool:
+    """Check if GPU i can access GPU j."""
+
+    # if the cache variable is already calculated,
+    # read from the cache instead of checking it again
+    global _gpu_p2p_access_cache
+    if _gpu_p2p_access_cache is not None:
+        return _gpu_p2p_access_cache[f"{i}->{j}"]
+
+    is_distributed = dist.is_initialized()
+
+    num_dev = torch.cuda.device_count()
+    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    if cuda_visible_devices is None:
+        cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
+    path = os.path.expanduser(
+        f"~/.config/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json")
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    if (not is_distributed or get_local_rank() == 0) \
+        and (not os.path.exists(path)):
+        # only the local master process (with local_rank == 0) can
+        #  enter this block to calculate the cache
+        logger.info(f"generating GPU P2P access cache for in {path}")
+        cache = {}
+        for _i in range(num_dev):
+            for _j in range(num_dev):
+                # on some platforms, P2P support might be buggy and we need
+                # additional checks. See also:
+                # https://github.com/vllm-project/vllm/issues/2728
+                cache[f"{_i}->{_j}"] = torch.cuda.can_device_access_peer(
+                    _i, _j) and _can_actually_p2p(_i, _j)
+        with open(path, "w") as f:
+            json.dump(cache, f, indent=4)
+    if is_distributed:
+        cpu_world_group = get_cpu_world_group()
+        dist.barrier(cpu_world_group)
+    logger.info(f"reading GPU P2P access cache from {path}")
+    with open(path, "r") as f:
+        cache = json.load(f)
+    _gpu_p2p_access_cache = cache
+    return _gpu_p2p_access_cache[f"{i}->{j}"]

From de26ef7789938bdc8a9edd098c4b53a2ec643575 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Sun, 14 Apr 2024 22:12:42 +0100
Subject: [PATCH 016/106] [BugFix] Fix tensorizer extra in setup.py (#4072)

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 5d9a646b11b0b..008272e77906c 100644
--- a/setup.py
+++ b/setup.py
@@ -429,7 +429,7 @@ def get_extra_requirements() -> dict:
     extras_require=get_extra_requirements(),
     ext_modules=ext_modules,
     extras_require={
-        "optional": ["tensorizer==2.9.0a1"],
+        "tensorizer": ["tensorizer==2.9.0a1"],
     },
     cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
     package_data=package_data,

From d3f28b18ae3e3db003352b5d14e6825c8631e794 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Sun, 14 Apr 2024 14:35:55 -0700
Subject: [PATCH 017/106] [Docs] document that mixtral 8x22b is supported
 (#4073)

---
 docs/source/models/supported_models.rst | 36 ++++++++++++-------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index c09b0ff250437..5e5ce871f61dd 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -30,23 +30,23 @@ Alongside each architecture, we include some popular models that use it.
   * - :code:`CohereForCausalLM`
     - Command-R
     - :code:`CohereForAI/c4ai-command-r-v01`, etc.
-    - 
+    -
   * - :code:`DbrxForCausalLM`
     - DBRX
     - :code:`databricks/dbrx-base`, :code:`databricks/dbrx-instruct`, etc.
-    - 
+    -
   * - :code:`DeciLMForCausalLM`
     - DeciLM
     - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc.
-    - 
+    -
   * - :code:`BloomForCausalLM`
     - BLOOM, BLOOMZ, BLOOMChat
     - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc.
-    - 
+    -
   * - :code:`FalconForCausalLM`
     - Falcon
     - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
-    - 
+    -
   * - :code:`GemmaForCausalLM`
     - Gemma
     - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc.
@@ -54,19 +54,19 @@ Alongside each architecture, we include some popular models that use it.
   * - :code:`GPT2LMHeadModel`
     - GPT-2
     - :code:`gpt2`, :code:`gpt2-xl`, etc.
-    - 
+    -
   * - :code:`GPTBigCodeForCausalLM`
     - StarCoder, SantaCoder, WizardCoder
     - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc.
-    - 
+    -
   * - :code:`GPTJForCausalLM`
     - GPT-J
     - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc.
-    - 
+    -
   * - :code:`GPTNeoXForCausalLM`
     - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
     - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc.
-    - 
+    -
   * - :code:`InternLMForCausalLM`
     - InternLM
     - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc.
@@ -93,32 +93,32 @@ Alongside each architecture, we include some popular models that use it.
     - ✅︎
   * - :code:`MixtralForCausalLM`
     - Mixtral-8x7B, Mixtral-8x7B-Instruct
-    - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.
+    - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, :code:`mistral-community/Mixtral-8x22B-v0.1`, etc.
     - ✅︎
   * - :code:`MPTForCausalLM`
     - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
     - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc.
-    - 
+    -
   * - :code:`OLMoForCausalLM`
     - OLMo
     - :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc.
-    - 
+    -
   * - :code:`OPTForCausalLM`
     - OPT, OPT-IML
     - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
-    - 
+    -
   * - :code:`OrionForCausalLM`
     - Orion
     - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc.
-    - 
+    -
   * - :code:`PhiForCausalLM`
     - Phi
     - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc.
-    - 
+    -
   * - :code:`QWenLMHeadModel`
     - Qwen
     - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
-    - 
+    -
   * - :code:`Qwen2ForCausalLM`
     - Qwen2
     - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc.
@@ -126,11 +126,11 @@ Alongside each architecture, we include some popular models that use it.
   * - :code:`Qwen2MoeForCausalLM`
     - Qwen2MoE
     - :code:`Qwen/Qwen1.5-MoE-A2.7B`, :code:`Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.
-    - 
+    -
   * - :code:`StableLmForCausalLM`
     - StableLM
     - :code:`stabilityai/stablelm-3b-4e1t/` , :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc.
-    - 
+    -
 
 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
 Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` for instructions on how to implement support for your model.

From 0012b9b3573aed1f188cb27014a85873969c4936 Mon Sep 17 00:00:00 2001
From: Roy <jasonailu87@gmail.com>
Date: Mon, 15 Apr 2024 08:43:54 +0800
Subject: [PATCH 018/106] [Misc] Upgrade triton to 2.2.0 (#4061)

---
 requirements-cpu.txt  | 2 +-
 requirements-cuda.txt | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 5779b38b24e69..e911ad03295f0 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -3,4 +3,4 @@
 
 # Dependencies for x86_64 CPUs
 torch == 2.2.1+cpu
-triton >= 2.1.0  # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
+triton >= 2.2.0  # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 6ee75e8139c04..c6d2cd46aee54 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -7,4 +7,3 @@ pynvml == 11.5.0
 vllm-nccl-cu12>=2.18,<2.19  # for downloading nccl library
 torch == 2.2.1
 xformers == 0.0.25  # Requires PyTorch 2.2.1
-triton >= 2.1.0

From 6bd8ad15a9fa0ec1abee76911284e18df97475e5 Mon Sep 17 00:00:00 2001
From: Zhuohan Li <zhuohan123@gmail.com>
Date: Sun, 14 Apr 2024 21:50:08 -0700
Subject: [PATCH 019/106] [Bugfix] Fix filelock version requirement (#4075)

---
 requirements-common.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index c96f9c9937fb0..90a3bc8abc1db 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -12,4 +12,5 @@ pydantic >= 2.0  # Required for OpenAI server.
 prometheus_client >= 0.18.0
 tiktoken == 0.6.0  # Required for DBRX tokenizer
 outlines == 0.0.34 # Requires torch >= 2.1.0
-typing_extensions
\ No newline at end of file
+typing_extensions
+filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4

From 5c33590e2d2df55f7ad46c5ee36195f7499f2514 Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Mon, 15 Apr 2024 23:35:55 +0800
Subject: [PATCH 020/106] [Misc][Minor] Fix CPU block num log in CPUExecutor.
 (#4088)

---
 vllm/executor/cpu_executor.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 33e67d8b3eec2..e63a88be7868f 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -74,7 +74,10 @@ def initialize_cache(self, num_gpu_blocks: int,
         # NOTE: We log here to avoid multiple logs when number of workers is
         # greater than one. We could log in the engine, but not all executors
         # have GPUs.
-        logger.info(f"# CPU blocks: {num_cpu_blocks}")
+        # NOTE: `cpu block` for CPU backend is located on CPU memory but is
+        # referred as `gpu block`. Because we want to reuse the existing block
+        # management procedure.
+        logger.info(f"# CPU blocks: {num_gpu_blocks}")
         self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
 
     def execute_model(self,

From 3d2820734b19c18ea6faf3366eb1fa7361192f8f Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Mon, 15 Apr 2024 13:05:09 -0700
Subject: [PATCH 021/106] [Core] Simplifications to executor classes (#4071)

---
 vllm/executor/cpu_executor.py     | 31 +++++++++-------------------
 vllm/executor/executor_base.py    | 27 +++++++++++++++++-------
 vllm/executor/gpu_executor.py     | 32 ++++-------------------------
 vllm/executor/neuron_executor.py  | 29 ++++++--------------------
 vllm/executor/ray_gpu_executor.py | 34 ++++---------------------------
 5 files changed, 44 insertions(+), 109 deletions(-)

diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index e63a88be7868f..f562e4e0ae3de 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -1,10 +1,9 @@
 import os
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Set, Tuple
 
 import torch
 
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig)
+from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -16,23 +15,13 @@
 
 class CPUExecutor(ExecutorBase):
 
-    def __init__(self, model_config: ModelConfig, cache_config: CacheConfig,
-                 parallel_config: ParallelConfig,
-                 scheduler_config: SchedulerConfig,
-                 device_config: DeviceConfig,
-                 lora_config: Optional[LoRAConfig], *args, **kwargs) -> None:
-        assert device_config.device_type == "cpu"
-        assert lora_config is None, "cpu backend doesn't support LoRA"
-        model_config = _verify_and_get_model_config(model_config)
-        cache_config = _verify_and_get_cache_config(cache_config)
-        scheduler_config = _verify_and_get_scheduler_config(scheduler_config)
-
-        self.model_config = model_config
-        self.cache_config = cache_config
-        self.lora_config = lora_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
+    def _init_executor(self) -> None:
+        assert self.device_config.device_type == "cpu"
+        assert self.lora_config is None, "cpu backend doesn't support LoRA"
+        self.model_config = _verify_and_get_model_config(self.model_config)
+        self.cache_config = _verify_and_get_cache_config(self.cache_config)
+        self.scheduler_config = _verify_and_get_scheduler_config(
+            self.scheduler_config)
 
         # Instantiate the worker and load the model to CPU.
         self._init_worker()
@@ -99,7 +88,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool:
     def remove_lora(self, lora_id: int) -> bool:
         return self.driver_worker.remove_lora(lora_id)
 
-    def list_loras(self) -> List[int]:
+    def list_loras(self) -> Set[int]:
         return self.driver_worker.list_loras()
 
     def check_health(self) -> None:
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index bbfbfc689c99f..bbb6ec80f7b7e 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,9 +1,9 @@
 from abc import ABC, abstractmethod
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Set, Tuple
 
 from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
-                         VisionLanguageConfig)
+                         TensorizerConfig, VisionLanguageConfig)
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 
@@ -16,7 +16,6 @@ class ExecutorBase(ABC):
     that can execute the model on multiple devices.
     """
 
-    @abstractmethod
     def __init__(
         self,
         model_config: ModelConfig,
@@ -27,8 +26,23 @@ def __init__(
         lora_config: Optional[LoRAConfig],
         vision_language_config: Optional[VisionLanguageConfig],
         speculative_config: Optional[SpeculativeConfig],
+        tensorizer_config: Optional[TensorizerConfig],
     ) -> None:
-        raise NotImplementedError
+        self.model_config = model_config
+        self.cache_config = cache_config
+        self.lora_config = lora_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+        self.vision_language_config = vision_language_config
+        self.speculative_config = speculative_config
+        self.tensorizer_config = tensorizer_config
+
+        self._init_executor()
+
+    @abstractmethod
+    def _init_executor(self) -> None:
+        pass
 
     @abstractmethod
     def determine_num_available_blocks(self) -> Tuple[int, int]:
@@ -71,7 +85,7 @@ def remove_lora(self, lora_id: int) -> bool:
         raise NotImplementedError
 
     @abstractmethod
-    def list_loras(self) -> List[int]:
+    def list_loras(self) -> Set[int]:
         raise NotImplementedError
 
     @abstractmethod
@@ -94,8 +108,7 @@ async def execute_model_async(
         """Executes one model step on the given sequences."""
         raise NotImplementedError
 
-    @abstractmethod
     async def check_health_async(self) -> None:
         """Checks if the executor is healthy. If not, it should raise an
         exception."""
-        raise NotImplementedError
+        self.check_health()
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index 30577ecf62faa..bae509f48025b 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -1,8 +1,5 @@
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Set, Tuple
 
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
-                         TensorizerConfig, VisionLanguageConfig)
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -15,24 +12,8 @@
 
 class GPUExecutor(ExecutorBase):
 
-    def __init__(self, model_config: ModelConfig, cache_config: CacheConfig,
-                 parallel_config: ParallelConfig,
-                 scheduler_config: SchedulerConfig,
-                 device_config: DeviceConfig,
-                 lora_config: Optional[LoRAConfig],
-                 vision_language_config: Optional[VisionLanguageConfig],
-                 speculative_config: Optional[SpeculativeConfig],
-                 tensorizer_config: Optional[TensorizerConfig]) -> None:
-        self.model_config = model_config
-        self.cache_config = cache_config
-        self.lora_config = lora_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.vision_language_config = vision_language_config
-        self.tensorizer_config = tensorizer_config
-
-        assert (not speculative_config
+    def _init_executor(self) -> None:
+        assert (not self.speculative_config
                 ), "Speculative decoding not yet supported for GPU backend"
 
         # Instantiate the worker and load the model to GPU.
@@ -103,7 +84,7 @@ def remove_lora(self, lora_id: int) -> bool:
         assert lora_id > 0, "lora_id must be greater than 0."
         return self.driver_worker.remove_lora(lora_id)
 
-    def list_loras(self) -> List[int]:
+    def list_loras(self) -> Set[int]:
         return self.driver_worker.list_loras()
 
     def check_health(self) -> None:
@@ -127,8 +108,3 @@ async def execute_model_async(
             blocks_to_swap_out=blocks_to_swap_out,
             blocks_to_copy=blocks_to_copy)
         return output
-
-    async def check_health_async(self) -> None:
-        # GPUExecutor will always be healthy as long as
-        # it's running.
-        return
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index d45f18e466256..273b17a927efd 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -1,8 +1,5 @@
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Set, Tuple
 
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
-                         VisionLanguageConfig)
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -13,24 +10,10 @@
 
 class NeuronExecutor(ExecutorBase):
 
-    def __init__(
-        self,
-        model_config: ModelConfig,
-        cache_config: CacheConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
-        speculative_config: Optional[SpeculativeConfig],
-    ) -> None:
-        self.model_config = model_config
-        self.cache_config = cache_config
-        assert lora_config is None, "LoRA is not supported for Neuron backend."
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        assert (not speculative_config
+    def _init_executor(self) -> None:
+        assert (self.lora_config is
+                None), "LoRA is not supported for Neuron backend."
+        assert (not self.speculative_config
                 ), "Speculative decoding not yet supported for Neuron backend."
 
         # Instantiate the worker and load the model to the device.
@@ -80,7 +63,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool:
     def remove_lora(self, lora_id: int) -> bool:
         return self.driver_worker.remove_lora(lora_id)
 
-    def list_loras(self) -> List[int]:
+    def list_loras(self) -> Set[int]:
         return self.driver_worker.list_loras()
 
     def check_health(self) -> None:
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 28dc3e0db312a..5db2f3f652532 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -3,11 +3,8 @@
 import os
 import pickle
 from collections import defaultdict
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple
 
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
-                         TensorizerConfig, VisionLanguageConfig)
 from vllm.engine.ray_utils import RayWorkerVllm, ray
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
@@ -32,27 +29,8 @@
 
 class RayGPUExecutor(ExecutorBase):
 
-    def __init__(
-        self,
-        model_config: ModelConfig,
-        cache_config: CacheConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
-        speculative_config: Optional[SpeculativeConfig],
-        tensorizer_config: Optional[TensorizerConfig],
-    ) -> None:
-        self.model_config = model_config
-        self.cache_config = cache_config
-        self.lora_config = lora_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        self.vision_language_config = vision_language_config
-        self.tensorizer_config = tensorizer_config
-        assert (not speculative_config
+    def _init_executor(self) -> None:
+        assert (not self.speculative_config
                 ), "Speculative decoding not yet supported for RayGPU backend."
 
         assert self.parallel_config.worker_use_ray
@@ -273,7 +251,7 @@ def remove_lora(self, lora_id: int) -> bool:
             lora_id=lora_id,
         )
 
-    def list_loras(self) -> List[int]:
+    def list_loras(self) -> Set[int]:
         return self._run_workers("list_loras")
 
     def _run_workers(
@@ -416,7 +394,3 @@ async def execute_model_async(
         # Only the driver worker returns the sampling results.
         output = all_outputs[0]
         return output
-
-    async def check_health_async(self) -> None:
-        """Raises an error if engine is unhealthy."""
-        self._check_if_any_actor_is_dead()

From 0008bf9be4fd9aca78b326599fd0358f5feb965b Mon Sep 17 00:00:00 2001
From: Sanger Steel <sangersteel@gmail.com>
Date: Mon, 15 Apr 2024 16:28:25 -0400
Subject: [PATCH 022/106] [Doc] Add better clarity for tensorizer usage (#4090)

Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
---
 docs/source/models/engine_args.rst       |  2 +-
 examples/tensorize_vllm_model.py         | 60 +++++++++++++++++-------
 vllm/model_executor/tensorizer_loader.py |  6 +--
 3 files changed, 46 insertions(+), 22 deletions(-)

diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst
index 886a806934c04..235cb4e128c99 100644
--- a/docs/source/models/engine_args.rst
+++ b/docs/source/models/engine_args.rst
@@ -45,7 +45,7 @@ Below, you can find an explanation of every engine argument for vLLM:
     * "safetensors" will load the weights in the safetensors format.
     * "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading.
     * "dummy" will initialize the weights with random values, mainly for profiling.
-    * "tensorizer" will load serialized weights using `CoreWeave's Tensorizer model deserializer. <https://github.com/coreweave/tensorizer>`_. See `tensorized_vllm_model.py` in the examples folder to serialize a vLLM model, and for more information. Tensorizer support for vLLM can be installed with `pip install vllm[tensorizer]`.
+    * "tensorizer" will load serialized weights using `CoreWeave's Tensorizer model deserializer. <https://github.com/coreweave/tensorizer>`_ See `examples/tensorize_vllm_model.py <https://github.com/vllm-project/vllm/blob/main/examples/tensorize_vllm_model.py>`_ to serialize a vLLM model, and for more information.
 
 .. option:: --dtype {auto,half,float16,bfloat16,float,float32}
 
diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py
index 3c20a38c7f726..8cf8be09d0b9c 100644
--- a/examples/tensorize_vllm_model.py
+++ b/examples/tensorize_vllm_model.py
@@ -23,14 +23,16 @@
 # yapf: disable
 """
 tensorize_vllm_model.py is a script that can be used to serialize and 
-deserialize vLLM models. These models can be loaded using tensorizer directly 
-to the GPU extremely quickly. Tensor encryption and decryption is also 
-supported, although libsodium must be installed to use it. Install
-vllm with tensorizer support using `pip install vllm[tensorizer]`.
+deserialize vLLM models. These models can be loaded using tensorizer 
+to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
+or locally. Tensor encryption and decryption is also supported, although 
+libsodium must be installed to use it. Install vllm with tensorizer support 
+using `pip install vllm[tensorizer]`.
 
-To serialize a model, you can run something like this:
+To serialize a model, install vLLM from source, then run something 
+like this from the root level of this repository:
 
-python tensorize_vllm_model.py \
+python -m examples.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    serialize \
@@ -38,31 +40,57 @@
    --suffix vllm
    
 Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
-and saves it to your S3 bucket. A local directory can also be used.
+and saves it to your S3 bucket. A local directory can also be used. This
+assumes your S3 credentials are specified as environment variables
+in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`.
+To provide S3 credentials directly, you can provide `--s3-access-key-id` and 
+`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this 
+script.
 
 You can also encrypt the model weights with a randomly-generated key by 
 providing a `--keyfile` argument.
 
-To deserialize a model, you can run something like this:
+To deserialize a model, you can run something like this from the root 
+level of this repository:
 
-python tensorize_vllm_model.py \
+python -m examples.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    deserialize \
    --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors
 
 Which downloads the model tensors from your S3 bucket and deserializes them.
-To provide S3 credentials, you can provide `--s3-access-key-id` and 
-`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this script,
-the OpenAI entrypoint, as arguments for LLM(), or as environment variables
-in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`.
-
 
 You can also provide a `--keyfile` argument to decrypt the model weights if 
 they were serialized with encryption.
 
-For more information on the available arguments, run 
-`python tensorize_vllm_model.py --help`.
+For more information on the available arguments for serializing, run 
+`python -m examples.tensorize_vllm_model serialize --help`.
+
+Or for deserializing:
+
+`python -m examples.tensorize_vllm_model deserialize --help`.
+
+Once a model is serialized, it can be used to load the model when running the
+OpenAI inference client at `vllm/entrypoints/openai/api_server.py` by providing
+the `--tensorizer-uri` CLI argument that is functionally the same as the
+`--path-to-tensors` argument in this script, along with `--vllm-tensorized`, to
+signify that the model to be deserialized is a vLLM model, rather than a 
+HuggingFace `PreTrainedModel`, which can also be deserialized using tensorizer
+in the same inference server, albeit without the speed optimizations. To
+deserialize an encrypted file, the `--encryption-keyfile` argument can be used
+to provide the path to the keyfile used to encrypt the model weights. For
+information on all the arguments that can be used to configure tensorizer's
+deserialization, check out the tensorizer options argument group in the
+`vllm/entrypoints/openai/api_server.py` script with `--help`.
+
+Tensorizer can also be invoked with the `LLM` class directly to load models:
+
+    llm = LLM(model="facebook/opt-125m",
+              load_format="tensorizer",
+              tensorizer_uri=path_to_opt_tensors,
+              num_readers=3,
+              vllm_tensorized=True)
 """
 
 
diff --git a/vllm/model_executor/tensorizer_loader.py b/vllm/model_executor/tensorizer_loader.py
index ed3ad9e2ffa15..8550cc97aefe8 100644
--- a/vllm/model_executor/tensorizer_loader.py
+++ b/vllm/model_executor/tensorizer_loader.py
@@ -126,7 +126,6 @@ def __post_init__(self):
             "s3_endpoint": self.s3_endpoint,
         }
 
-        # Omitting self.dtype and self.device as this behaves weirdly
         self.deserializer_params = {
             "verify_hash": self.verify_hash,
             "encryption": self.encryption_keyfile,
@@ -145,7 +144,7 @@ def add_cli_args(
             parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         """Tensorizer CLI arguments"""
 
-        # Create the argument group
+        # Tensorizer options arg group
         group = parser.add_argument_group(
             'tensorizer options',
             description=('Options for configuring the behavior of the'
@@ -205,9 +204,7 @@ def add_cli_args(
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace) -> "TensorizerArgs":
-        # Get the list of attributes of this dataclass.
         attrs = [attr.name for attr in dataclasses.fields(cls)]
-        # Set the attributes from the parsed arguments.
         tensorizer_args = cls(**{
             attr: getattr(args, attr)
             for attr in attrs if hasattr(args, attr)
@@ -291,7 +288,6 @@ def deserialize(self):
             nn.Module: The deserialized model.
         """
         before_mem = get_mem_usage()
-        # Lazy load the tensors from S3 into the model.
         start = time.perf_counter()
         with open_stream(
                 self.tensorizer_args.tensorizer_uri,

From 6800f958d9c3184143eb00929dee5daa41ad71f1 Mon Sep 17 00:00:00 2001
From: Ricky Xu <xuchen727@hotmail.com>
Date: Mon, 15 Apr 2024 14:24:45 -0700
Subject: [PATCH 023/106] [Bugfix] Fix ray workers profiling with nsight 
 (#4095)

---
 vllm/executor/ray_gpu_executor.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 5db2f3f652532..7aca5e36107aa 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -48,6 +48,21 @@ def _init_executor(self) -> None:
         if USE_RAY_COMPILED_DAG:
             self.forward_dag = self._compiled_ray_dag()
 
+    def _configure_ray_workers_use_nsight(self,
+                                          ray_remote_kwargs) -> Dict[str, Any]:
+        # If nsight profiling is enabled, we need to set the profiling
+        # configuration for the ray workers as runtime env.
+        runtime_env = ray_remote_kwargs.setdefault("runtime_env", {})
+        runtime_env.update({
+            "nsight": {
+                "t": "cuda,cudnn,cublas",
+                "o": "'worker_process_%p'",
+                "cuda-graph-trace": "node",
+            }
+        })
+
+        return ray_remote_kwargs
+
     def _init_workers_ray(self, placement_group: "PlacementGroup",
                           **ray_remote_kwargs):
         if self.parallel_config.tensor_parallel_size == 1:
@@ -63,6 +78,10 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
         # The remaining workers are the actual ray actors.
         self.workers: List[RayWorkerVllm] = []
 
+        if self.parallel_config.ray_workers_use_nsight:
+            ray_remote_kwargs = self._configure_ray_workers_use_nsight(
+                ray_remote_kwargs)
+
         # Create the workers.
         driver_ip = get_ip()
         for bundle_id, bundle in enumerate(placement_group.bundle_specs):

From 43af0d05bc2ae8e68493b9e5b2ba45aad5770abd Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Tue, 16 Apr 2024 06:47:31 +0900
Subject: [PATCH 024/106] [Typing] Fix Sequence type GenericAlias only
 available after Python 3.9. (#4092)

---
 vllm/core/block_manager_v1.py | 5 +++--
 vllm/core/block_manager_v2.py | 2 +-
 vllm/core/interfaces.py       | 2 +-
 vllm/utils.py                 | 7 ++++---
 4 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index e391a3b1e5a33..be093922b84f2 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -1,9 +1,10 @@
 """A block manager that manages token blocks."""
 from abc import ABC, abstractmethod
-from collections.abc import Sequence as GenericSequence
 from itertools import count, takewhile
 from os.path import commonprefix
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional
+from typing import Sequence as GenericSequence
+from typing import Set
 
 from vllm.block import BlockTable, PhysicalTokenBlock
 from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index 19f0cf415eb34..6339a6baf4161 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -1,6 +1,6 @@
 """A block manager that manages token blocks."""
-from collections.abc import Sequence as GenericSequence
 from typing import Dict, List, Optional
+from typing import Sequence as GenericSequence
 
 from vllm.core.block.block_table import BlockTable
 from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py
index c1f68a2e891bf..56c2c5995c38b 100644
--- a/vllm/core/interfaces.py
+++ b/vllm/core/interfaces.py
@@ -1,7 +1,7 @@
 import enum
 from abc import ABC, abstractmethod
-from collections.abc import Sequence as GenericSequence
 from typing import Dict, List
+from typing import Sequence as GenericSequence
 
 from vllm.sequence import Sequence, SequenceGroup
 
diff --git a/vllm/utils.py b/vllm/utils.py
index f4aaf0cc51d7a..9e0ec6eab9383 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -6,11 +6,12 @@
 import subprocess
 import uuid
 import warnings
-from collections import OrderedDict, defaultdict
+from collections import defaultdict
 from functools import lru_cache, partial
 from platform import uname
 from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic,
-                    Hashable, List, Optional, Tuple, TypeVar, Union)
+                    Hashable, List, Optional, OrderedDict, Tuple, TypeVar,
+                    Union)
 
 import psutil
 import torch
@@ -51,7 +52,7 @@ def reset(self) -> None:
 class LRUCache(Generic[T]):
 
     def __init__(self, capacity: int):
-        self.cache = OrderedDict[Hashable, T]()
+        self.cache: OrderedDict[Hashable, T] = OrderedDict()
         self.capacity = capacity
 
     def __contains__(self, key: Hashable) -> bool:

From f0456124d0b08893324a917a16ae2bc270f881c9 Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Tue, 16 Apr 2024 14:24:53 +0900
Subject: [PATCH 025/106] [Core] Fix engine-use-ray broken  (#4105)

---
 tests/async_engine/test_api_server.py | 17 +++++++++++++----
 vllm/engine/async_llm_engine.py       |  7 +++----
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py
index 248bfbc8ab5c0..7f57d5cf9b182 100644
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -25,21 +25,30 @@ def _query_server_long(prompt: str) -> dict:
 
 
 @pytest.fixture
-def api_server(tokenizer_pool_size: int):
+def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
+               worker_use_ray: bool):
     script_path = Path(__file__).parent.joinpath(
         "api_server_async_engine.py").absolute()
-    uvicorn_process = subprocess.Popen([
+    commands = [
         sys.executable, "-u",
         str(script_path), "--model", "facebook/opt-125m", "--host",
         "127.0.0.1", "--tokenizer-pool-size",
         str(tokenizer_pool_size)
-    ])
+    ]
+    if engine_use_ray:
+        commands.append("--engine-use-ray")
+    if worker_use_ray:
+        commands.append("--worker-use-ray")
+    uvicorn_process = subprocess.Popen(commands)
     yield
     uvicorn_process.terminate()
 
 
 @pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
-def test_api_server(api_server, tokenizer_pool_size: int):
+@pytest.mark.parametrize("worker_use_ray", [False, True])
+@pytest.mark.parametrize("engine_use_ray", [False, True])
+def test_api_server(api_server, tokenizer_pool_size: int, worker_use_ray: bool,
+                    engine_use_ray: bool):
     """
     Run the API server and test it.
 
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index f610495135121..1dbf58904541c 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -333,8 +333,7 @@ def from_engine_args(
         if engine_config.device_config.device_type == "neuron":
             raise NotImplementedError("Neuron is not supported for "
                                       "async engine yet.")
-        elif (engine_config.parallel_config.worker_use_ray
-              or engine_args.engine_use_ray):
+        elif engine_config.parallel_config.worker_use_ray:
             initialize_ray_cluster(engine_config.parallel_config)
             from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
             executor_class = RayGPUExecutorAsync
@@ -410,8 +409,8 @@ def _init_engine(self, *args,
         else:
             # FIXME(woosuk): This is a bit hacky. Be careful when changing the
             # order of the arguments.
-            cache_config = args[1]
-            parallel_config = args[2]
+            cache_config = kwargs["cache_config"]
+            parallel_config = kwargs["parallel_config"]
             if parallel_config.tensor_parallel_size == 1:
                 num_gpus = cache_config.gpu_memory_utilization
             else:

From bc92515f2f7ff77bf7b040804985d97d18d92bfe Mon Sep 17 00:00:00 2001
From: Noam Gat <noamgat@gmail.com>
Date: Tue, 16 Apr 2024 08:54:57 +0300
Subject: [PATCH 026/106] LM Format Enforcer Guided Decoding Support (#3868)

Co-authored-by: Simon Mo <simon.mo@hey.com>
---
 requirements-common.txt                       |   1 +
 tests/entrypoints/test_guided_processors.py   |  42 +++++++-
 tests/entrypoints/test_openai_server.py       |  69 ++++++++----
 vllm/config.py                                |  26 ++++-
 vllm/engine/arg_utils.py                      |  18 +++-
 vllm/engine/llm_engine.py                     |  10 +-
 vllm/entrypoints/openai/protocol.py           |  12 +++
 vllm/entrypoints/openai/serving_chat.py       |   6 +-
 vllm/entrypoints/openai/serving_completion.py |   6 +-
 .../guided_decoding/__init__.py               |  25 +++++
 .../lm_format_enforcer_decoding.py            |  69 ++++++++++++
 .../outlines_decoding.py}                     |   7 +-
 .../outlines_logits_processors.py}            | 100 +++++++++---------
 13 files changed, 304 insertions(+), 87 deletions(-)
 create mode 100644 vllm/model_executor/guided_decoding/__init__.py
 create mode 100644 vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
 rename vllm/model_executor/{guided_decoding.py => guided_decoding/outlines_decoding.py} (93%)
 rename vllm/model_executor/{guided_logits_processors.py => guided_decoding/outlines_logits_processors.py} (70%)

diff --git a/requirements-common.txt b/requirements-common.txt
index 90a3bc8abc1db..c1614d2537b25 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -11,6 +11,7 @@ uvicorn[standard]
 pydantic >= 2.0  # Required for OpenAI server.
 prometheus_client >= 0.18.0
 tiktoken == 0.6.0  # Required for DBRX tokenizer
+lm-format-enforcer == 0.9.3
 outlines == 0.0.34 # Requires torch >= 2.1.0
 typing_extensions
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py
index 5622744566bcc..30f0ad5d8272f 100644
--- a/tests/entrypoints/test_guided_processors.py
+++ b/tests/entrypoints/test_guided_processors.py
@@ -1,11 +1,14 @@
 # This unit test should be moved to a new
 # tests/test_guided_decoding directory.
-
+import pytest
 import torch
 from transformers import AutoTokenizer
 
-from vllm.model_executor.guided_logits_processors import (JSONLogitsProcessor,
-                                                          RegexLogitsProcessor)
+from vllm.entrypoints.openai.protocol import CompletionRequest
+from vllm.model_executor.guided_decoding import (
+    get_guided_decoding_logits_processor)
+from vllm.model_executor.guided_decoding.outlines_logits_processors import (
+    JSONLogitsProcessor, RegexLogitsProcessor)
 
 TEST_SCHEMA = {
     "type": "object",
@@ -73,3 +76,36 @@ def test_guided_logits_processors():
     json_LP(token_ids, tensor)
     assert tensor.shape == original_tensor.shape
     assert not torch.allclose(tensor, original_tensor)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"])
+async def test_guided_logits_processor_black_box(backend: str):
+    tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
+    token_ids = tokenizer.encode(
+        f"Give an example IPv4 address with this regex: {TEST_REGEX}")
+    regex_request = CompletionRequest(model='test',
+                                      prompt=token_ids,
+                                      guided_regex=TEST_REGEX)
+    regex_lp = await get_guided_decoding_logits_processor(
+        backend, regex_request, tokenizer)
+    assert regex_lp is not None
+    tensor = torch.rand(32000)
+    original_tensor = torch.clone(tensor)
+    tensor = regex_lp(token_ids, tensor)
+    assert tensor.shape == original_tensor.shape
+    assert not torch.allclose(tensor, original_tensor)
+
+    token_ids = tokenizer.encode(
+        f"Give an employee profile that fits this schema: {TEST_SCHEMA}")
+    json_request = CompletionRequest(model='test',
+                                     prompt=token_ids,
+                                     guided_json=TEST_SCHEMA)
+    json_lp = await get_guided_decoding_logits_processor(
+        backend, json_request, tokenizer)
+    assert json_lp is not None
+    tensor = torch.rand(32000)
+    original_tensor = torch.clone(tensor)
+    tensor = json_lp(token_ids, tensor)
+    assert tensor.shape == original_tensor.shape
+    assert not torch.allclose(tensor, original_tensor)
diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index 047b2abbe28ec..bdb80056f3961 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -506,7 +506,10 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
     assert first_response != completion.choices[0].text
 
 
-async def test_guided_json_completion(server, client: openai.AsyncOpenAI):
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
+                                      guided_decoding_backend: str):
     completion = await client.completions.create(
         model=MODEL_NAME,
         prompt=f"Give an example JSON for an employee profile "
@@ -514,7 +517,8 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI):
         n=3,
         temperature=1.0,
         max_tokens=500,
-        extra_body=dict(guided_json=TEST_SCHEMA))
+        extra_body=dict(guided_json=TEST_SCHEMA,
+                        guided_decoding_backend=guided_decoding_backend))
 
     assert completion.id is not None
     assert completion.choices is not None and len(completion.choices) == 3
@@ -524,7 +528,10 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI):
         jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)
 
 
-async def test_guided_json_chat(server, client: openai.AsyncOpenAI):
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
+                                guided_decoding_backend: str):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -538,8 +545,9 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI):
     chat_completion = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=messages,
-        max_tokens=500,
-        extra_body=dict(guided_json=TEST_SCHEMA))
+        max_tokens=1000,
+        extra_body=dict(guided_json=TEST_SCHEMA,
+                        guided_decoding_backend=guided_decoding_backend))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json1 = json.loads(message.content)
@@ -555,8 +563,9 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI):
     chat_completion = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=messages,
-        max_tokens=500,
-        extra_body=dict(guided_json=TEST_SCHEMA))
+        max_tokens=1000,
+        extra_body=dict(guided_json=TEST_SCHEMA,
+                        guided_decoding_backend=guided_decoding_backend))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json2 = json.loads(message.content)
@@ -565,14 +574,18 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI):
     assert json1["age"] != json2["age"]
 
 
-async def test_guided_regex_completion(server, client: openai.AsyncOpenAI):
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
+                                       guided_decoding_backend: str):
     completion = await client.completions.create(
         model=MODEL_NAME,
         prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}",
         n=3,
         temperature=1.0,
         max_tokens=20,
-        extra_body=dict(guided_regex=TEST_REGEX))
+        extra_body=dict(guided_regex=TEST_REGEX,
+                        guided_decoding_backend=guided_decoding_backend))
 
     assert completion.id is not None
     assert completion.choices is not None and len(completion.choices) == 3
@@ -581,7 +594,10 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI):
         assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None
 
 
-async def test_guided_regex_chat(server, client: openai.AsyncOpenAI):
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
+                                 guided_decoding_backend: str):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -595,7 +611,8 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI):
         model=MODEL_NAME,
         messages=messages,
         max_tokens=20,
-        extra_body=dict(guided_regex=TEST_REGEX))
+        extra_body=dict(guided_regex=TEST_REGEX,
+                        guided_decoding_backend=guided_decoding_backend))
     ip1 = chat_completion.choices[0].message.content
     assert ip1 is not None
     assert re.fullmatch(TEST_REGEX, ip1) is not None
@@ -606,21 +623,26 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI):
         model=MODEL_NAME,
         messages=messages,
         max_tokens=20,
-        extra_body=dict(guided_regex=TEST_REGEX))
+        extra_body=dict(guided_regex=TEST_REGEX,
+                        guided_decoding_backend=guided_decoding_backend))
     ip2 = chat_completion.choices[0].message.content
     assert ip2 is not None
     assert re.fullmatch(TEST_REGEX, ip2) is not None
     assert ip1 != ip2
 
 
-async def test_guided_choice_completion(server, client: openai.AsyncOpenAI):
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
+                                        guided_decoding_backend: str):
     completion = await client.completions.create(
         model=MODEL_NAME,
         prompt="The best language for type-safe systems programming is ",
         n=2,
         temperature=1.0,
         max_tokens=10,
-        extra_body=dict(guided_choice=TEST_CHOICE))
+        extra_body=dict(guided_choice=TEST_CHOICE,
+                        guided_decoding_backend=guided_decoding_backend))
 
     assert completion.id is not None
     assert completion.choices is not None and len(completion.choices) == 2
@@ -628,7 +650,10 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI):
         assert completion.choices[i].text in TEST_CHOICE
 
 
-async def test_guided_choice_chat(server, client: openai.AsyncOpenAI):
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
+                                  guided_decoding_backend: str):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -642,7 +667,8 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI):
         model=MODEL_NAME,
         messages=messages,
         max_tokens=10,
-        extra_body=dict(guided_choice=TEST_CHOICE))
+        extra_body=dict(guided_choice=TEST_CHOICE,
+                        guided_decoding_backend=guided_decoding_backend))
     choice1 = chat_completion.choices[0].message.content
     assert choice1 in TEST_CHOICE
 
@@ -655,18 +681,23 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI):
         model=MODEL_NAME,
         messages=messages,
         max_tokens=10,
-        extra_body=dict(guided_choice=TEST_CHOICE))
+        extra_body=dict(guided_choice=TEST_CHOICE,
+                        guided_decoding_backend=guided_decoding_backend))
     choice2 = chat_completion.choices[0].message.content
     assert choice2 in TEST_CHOICE
     assert choice1 != choice2
 
 
-async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI):
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
+                                          guided_decoding_backend: str):
     with pytest.raises(openai.BadRequestError):
         _ = await client.completions.create(
             model=MODEL_NAME,
             prompt="Give an example JSON that fits this schema: 42",
-            extra_body=dict(guided_json=42))
+            extra_body=dict(guided_json=42,
+                            guided_decoding_backend=guided_decoding_backend))
 
     messages = [{
         "role": "system",
diff --git a/vllm/config.py b/vllm/config.py
index a626933713708..d14ea20b54581 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -66,8 +66,8 @@ class ModelConfig:
             weights. If None, we assume the model weights are not quantized.
         quantization_param_path: Path to JSON file containing scaling factors.
             Used to load KV cache scaling factors into the model when KV cache
-            type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also 
-            be used to load activation and weight scaling factors when the 
+            type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
+            be used to load activation and weight scaling factors when the
             model dtype is FP8_E4M3 on ROCm.
         enforce_eager: Whether to enforce eager execution. If True, we will
             disable CUDA graph and always execute the model in eager mode.
@@ -454,7 +454,7 @@ def verify_with_parallel_config(
 @dataclass
 class TokenizerPoolConfig:
     """Configuration for the tokenizer pool.
-    
+
     Args:
         pool_size: Number of tokenizer workers in the pool.
         pool_type: Type of the pool.
@@ -478,9 +478,9 @@ def create_config(
         tokenizer_pool_extra_config: Optional[Union[str, dict]]
     ) -> Optional["TokenizerPoolConfig"]:
         """Create a TokenizerPoolConfig from the given parameters.
-        
+
         If tokenizer_pool_size is 0, return None.
-        
+
         Args:
             tokenizer_pool_size: Number of tokenizer workers in the pool.
             tokenizer_pool_type: Type of the pool.
@@ -1119,6 +1119,21 @@ def _get_and_verify_max_len(
     return int(max_model_len)
 
 
+@dataclass
+class DecodingConfig:
+    """Dataclass which contains the decoding strategy of the engine"""
+
+    # Which guided decoding algo to use. 'outlines' / 'lm-format-enforcer'
+    guided_decoding_backend: str = 'outlines'
+
+    def __post_init__(self):
+        valid_guided_backends = ['outlines', 'lm-format-enforcer']
+        backend = self.guided_decoding_backend
+        if backend not in valid_guided_backends:
+            raise ValueError(f"Invalid guided_decoding_backend '{backend},"
+                             f"must be one of {valid_guided_backends}")
+
+
 @dataclass(frozen=True)
 class EngineConfig:
     """Dataclass which contains all engine-related configuration. This
@@ -1133,6 +1148,7 @@ class EngineConfig:
     lora_config: Optional[LoRAConfig]
     vision_language_config: Optional[VisionLanguageConfig]
     speculative_config: Optional[SpeculativeConfig]
+    decoding_config: Optional[DecodingConfig]
     tensorizer_config: Optional[TensorizerConfig]
 
     def __post_init__(self):
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 90453adabf379..16333bf380ae8 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -7,9 +7,9 @@
 from dataclasses import dataclass
 from typing import BinaryIO, Optional, Union
 
-from vllm.config import (CacheConfig, DeviceConfig, EngineConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         SpeculativeConfig, TensorizerConfig,
+from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
+                         EngineConfig, LoRAConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig, SpeculativeConfig, TensorizerConfig,
                          TokenizerPoolConfig, VisionLanguageConfig)
 from vllm.model_executor.tensorizer_loader import TensorizerArgs
 from vllm.utils import str_to_int_tuple
@@ -84,6 +84,7 @@ class EngineArgs:
     scheduler_delay_factor: float = 0.0
     enable_chunked_prefill: bool = False
 
+    guided_decoding_backend: str = 'outlines'
     # Speculative decoding configuration.
     speculative_model: Optional[str] = None
     num_speculative_tokens: Optional[int] = None
@@ -204,6 +205,13 @@ def add_cli_args(
                             default=EngineArgs.max_model_len,
                             help='model context length. If unspecified, '
                             'will be automatically derived from the model.')
+        parser.add_argument(
+            '--guided-decoding-backend',
+            type=str,
+            default='outlines',
+            choices=['outlines', 'lm-format-enforcer'],
+            help='Which engine will be used for guided decoding'
+            ' (JSON schema / regex etc)')
         # Parallel arguments
         parser.add_argument('--worker-use-ray',
                             action='store_true',
@@ -539,6 +547,9 @@ def create_engine_config(self, ) -> EngineConfig:
         else:
             vision_language_config = None
 
+        decoding_config = DecodingConfig(
+            guided_decoding_backend=self.guided_decoding_backend)
+
         return EngineConfig(model_config=model_config,
                             cache_config=cache_config,
                             parallel_config=parallel_config,
@@ -547,6 +558,7 @@ def create_engine_config(self, ) -> EngineConfig:
                             lora_config=lora_config,
                             vision_language_config=vision_language_config,
                             speculative_config=speculative_config,
+                            decoding_config=decoding_config,
                             tensorizer_config=tensorizer_config)
 
 
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index c02234f7fe3f5..2d30e1fe0d64a 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -4,9 +4,10 @@
 from transformers import PreTrainedTokenizer
 
 import vllm
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
-                         TensorizerConfig, VisionLanguageConfig)
+from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoRAConfig,
+                         ModelConfig, ParallelConfig, SchedulerConfig,
+                         SpeculativeConfig, TensorizerConfig,
+                         VisionLanguageConfig)
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.metrics import StatLogger, Stats
@@ -74,6 +75,7 @@ def __init__(
         lora_config: Optional[LoRAConfig],
         vision_language_config: Optional[VisionLanguageConfig],
         speculative_config: Optional[SpeculativeConfig],
+        decoding_config: Optional[DecodingConfig],
         tensorizer_config: Optional[TensorizerConfig],
         executor_class: Type[ExecutorBase],
         log_stats: bool,
@@ -102,6 +104,7 @@ def __init__(
             f"kv_cache_dtype={cache_config.cache_dtype}, "
             f"quantization_param_path={model_config.quantization_param_path}, "
             f"device_config={device_config.device}, "
+            f"decoding_config={decoding_config!r}, "
             f"seed={model_config.seed})")
         # TODO(woosuk): Print more configs in debug mode.
 
@@ -113,6 +116,7 @@ def __init__(
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.speculative_config = speculative_config
+        self.decoding_config = decoding_config or DecodingConfig()
         self.tensorizer_config = tensorizer_config
         self.log_stats = log_stats
 
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index f94d22d279cc4..cf779d44c816b 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -133,6 +133,12 @@ class ChatCompletionRequest(BaseModel):
         description=(
             "If specified, the output will follow the context free grammar."),
     )
+    guided_decoding_backend: Optional[str] = Field(
+        default=None,
+        description=(
+            "If specified, will override the default guided decoding backend "
+            "of the server for this specific request. If set, must be either "
+            "'outlines' / 'lm-format-enforcer'"))
 
     # doc: end-chat-completion-extra-params
 
@@ -265,6 +271,12 @@ class CompletionRequest(BaseModel):
         description=(
             "If specified, the output will follow the context free grammar."),
     )
+    guided_decoding_backend: Optional[str] = Field(
+        default=None,
+        description=(
+            "If specified, will override the default guided decoding backend "
+            "of the server for this specific request. If set, must be one of "
+            "'outlines' / 'lm-format-enforcer'"))
 
     # doc: end-completion-extra-params
 
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index a03c5dc88108f..c9ed4a9de20f4 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -68,9 +68,13 @@ async def create_chat_completion(
                 request, prompt=prompt)
             sampling_params = request.to_sampling_params()
             lora_request = self._maybe_get_lora(request)
+            decoding_config = self.engine.engine.decoding_config
+            guided_decoding_backend = request.guided_decoding_backend \
+                or decoding_config.guided_decoding_backend
             guided_decode_logits_processor = (
                 await get_guided_decoding_logits_processor(
-                    request, await self.engine.get_tokenizer()))
+                    guided_decoding_backend, request, await
+                    self.engine.get_tokenizer()))
             if guided_decode_logits_processor:
                 if sampling_params.logits_processors is None:
                     sampling_params.logits_processors = []
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index e24aa2489a80f..a71f2d6a4426a 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -88,9 +88,13 @@ async def create_completion(self, request: CompletionRequest,
         try:
             sampling_params = request.to_sampling_params()
             lora_request = self._maybe_get_lora(request)
+            decoding_config = self.engine.engine.decoding_config
+            guided_decoding_backend = request.guided_decoding_backend \
+                or decoding_config.guided_decoding_backend
             guided_decode_logit_processor = (
                 await get_guided_decoding_logits_processor(
-                    request, await self.engine.get_tokenizer()))
+                    guided_decoding_backend, request, await
+                    self.engine.get_tokenizer()))
             if guided_decode_logit_processor is not None:
                 if sampling_params.logits_processors is None:
                     sampling_params.logits_processors = []
diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py
new file mode 100644
index 0000000000000..0558d6c95d97b
--- /dev/null
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -0,0 +1,25 @@
+from typing import Optional, Union
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              CompletionRequest)
+from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import (
+    get_lm_format_enforcer_guided_decoding_logits_processor)
+from vllm.model_executor.guided_decoding.outlines_decoding import (
+    get_outlines_guided_decoding_logits_processor)
+from vllm.sampling_params import LogitsProcessor
+
+
+async def get_guided_decoding_logits_processor(
+        guided_decoding_backend: str, request: Union[CompletionRequest,
+                                                     ChatCompletionRequest],
+        tokenizer) -> Optional[LogitsProcessor]:
+    if guided_decoding_backend == 'outlines':
+        return await get_outlines_guided_decoding_logits_processor(
+            request, tokenizer)
+    if guided_decoding_backend == 'lm-format-enforcer':
+        return await get_lm_format_enforcer_guided_decoding_logits_processor(
+            request, tokenizer)
+
+    raise ValueError(
+        f"Unknown guided decoding backend '{guided_decoding_backend}'. "
+        "Must be one of 'outlines, 'lm-format-enforcer'")
diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
new file mode 100644
index 0000000000000..0d74a5f8e81ff
--- /dev/null
+++ b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
@@ -0,0 +1,69 @@
+from functools import lru_cache
+from json import loads as json_loads
+from typing import Optional, Union
+
+from lmformatenforcer import (CharacterLevelParser, JsonSchemaParser,
+                              RegexParser, StringParser,
+                              TokenEnforcerTokenizerData, UnionParser)
+from lmformatenforcer.integrations.vllm import (
+    build_vllm_logits_processor, build_vllm_token_enforcer_tokenizer_data)
+from pydantic import BaseModel
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              CompletionRequest)
+from vllm.model_executor.guided_decoding.outlines_decoding import (
+    get_outlines_guided_decoding_logits_processor)
+from vllm.sampling_params import LogitsProcessor
+
+
+async def get_lm_format_enforcer_guided_decoding_logits_processor(
+        request: Union[CompletionRequest, ChatCompletionRequest],
+        tokenizer) -> Optional[LogitsProcessor]:
+    """
+    Given an OpenAI-compatible request, check for guided decoding parameters
+    and get the necessary logits processor for the given guide.
+    We cache logit processors by (guide, tokenizer), and on cache hit
+    we make a shallow copy to reuse the same underlying FSM.
+    """
+
+    tokenizer_data = _cached_build_vllm_token_enforcer_tokenizer_data(
+        tokenizer)
+    character_level_parser: CharacterLevelParser
+    if request.guided_json:
+        schema = _normalize_json_schema_object(request.guided_json)
+        character_level_parser = JsonSchemaParser(schema)
+    elif request.guided_choice:
+        character_level_parser = UnionParser(
+            [StringParser(choice) for choice in request.guided_choice])
+    elif request.guided_regex:
+        character_level_parser = RegexParser(request.guided_regex)
+    elif request.guided_grammar:
+        # CFG grammar not supported by LMFE, revert to outlines
+        return await get_outlines_guided_decoding_logits_processor(
+            request, tokenizer)
+    elif (request.response_format is not None
+          and request.response_format.type == "json_object"):
+        character_level_parser = JsonSchemaParser(
+            None)  # None means any json object
+    else:
+        return None
+
+    logits_processor = build_vllm_logits_processor(tokenizer_data,
+                                                   character_level_parser)
+    return logits_processor
+
+
+def _normalize_json_schema_object(schema: Union[str, dict, BaseModel]) -> dict:
+    if isinstance(schema, str):
+        return json_loads(schema)
+    if isinstance(schema, dict):
+        return schema
+    if isinstance(schema, BaseModel):
+        return schema.model_json_schema()
+
+
+@lru_cache
+def _cached_build_vllm_token_enforcer_tokenizer_data(
+        tokenizer: PreTrainedTokenizerBase) -> TokenEnforcerTokenizerData:
+    return build_vllm_token_enforcer_tokenizer_data(tokenizer)
diff --git a/vllm/model_executor/guided_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py
similarity index 93%
rename from vllm/model_executor/guided_decoding.py
rename to vllm/model_executor/guided_decoding/outlines_decoding.py
index 8e710f1ac2b53..bd4564a36e1ed 100644
--- a/vllm/model_executor/guided_decoding.py
+++ b/vllm/model_executor/guided_decoding/outlines_decoding.py
@@ -12,9 +12,8 @@
 
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               CompletionRequest)
-from vllm.model_executor.guided_logits_processors import (CFGLogitsProcessor,
-                                                          JSONLogitsProcessor,
-                                                          RegexLogitsProcessor)
+from vllm.model_executor.guided_decoding.outlines_logits_processors import (
+    CFGLogitsProcessor, JSONLogitsProcessor, RegexLogitsProcessor)
 
 
 class GuidedDecodingMode(Enum):
@@ -54,7 +53,7 @@ class GuidedDecodingMode(Enum):
 global_thread_pool = None  # used for generating logits processor fsm
 
 
-async def get_guided_decoding_logits_processor(
+async def get_outlines_guided_decoding_logits_processor(
         request: Union[CompletionRequest, ChatCompletionRequest],
         tokenizer) -> Union[JSONLogitsProcessor, RegexLogitsProcessor]:
     """
diff --git a/vllm/model_executor/guided_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
similarity index 70%
rename from vllm/model_executor/guided_logits_processors.py
rename to vllm/model_executor/guided_decoding/outlines_logits_processors.py
index 035fe00037328..28041695546dc 100644
--- a/vllm/model_executor/guided_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -13,9 +13,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import copy
 import json
 import math
 from collections import defaultdict
+from functools import lru_cache
 from typing import Callable, DefaultDict, Dict, List, Optional, Union
 
 import torch
@@ -27,50 +29,6 @@
 
 class BaseLogitsProcessor:
 
-    def adapt_tokenizer(self, tokenizer: PreTrainedTokenizerBase):
-        """Adapt vLLM's tokenizer to use to compile the FSM.
-
-        The API of Outlines tokenizers is slightly different to that of
-        `transformers`. The decoder of outlines, returns a list whereas
-        the decode of vLLM returns an str. To sync the vLLM decoder with
-        outlines internal api, the decoder should be adapted. In addition
-        we need to handle the missing spaces to Llama's tokenizer to be
-        able to compile FSMs for this model.
-
-        """
-        if getattr(tokenizer, "_outlines_adapted", False):
-            return tokenizer
-
-        tokenizer.vocabulary = tokenizer.get_vocab()
-        tokenizer.special_tokens = set(tokenizer.all_special_tokens)
-
-        def convert_token_to_string(token: str) -> str:
-            from transformers.file_utils import SPIECE_UNDERLINE
-
-            string = tokenizer.convert_tokens_to_string([token])
-
-            # A hack to handle missing spaces to HF's Llama tokenizers
-            if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
-                return " " + string
-
-            return string
-
-        def change_decoder(
-            decoder: Callable[[List[int]], str]
-        ) -> Callable[[List[int]], List[str]]:
-            """Sync vLLM's decoder with the outlines by returning list."""
-
-            def new_decoder(inp_tokens: List[int]) -> List[str]:
-                return [decoder(inp_tokens)]
-
-            return new_decoder
-
-        tokenizer.convert_token_to_string = convert_token_to_string
-        tokenizer.decode = change_decoder(tokenizer.decode)
-        setattr(tokenizer, "_outlines_adapted", True)  # noqa: B010
-
-        return tokenizer
-
     def init_state(self):
         """Initialize the FSM states."""
         self.fsm_state: DefaultDict[int, int] = defaultdict(int)
@@ -78,7 +36,6 @@ def init_state(self):
     def __call__(self, input_ids: List[int],
                  scores: torch.Tensor) -> torch.Tensor:
         """Use the FSM to bias the logits before sampling the next token."""
-
         seq_id = hash(tuple(input_ids))
 
         if len(input_ids) == 0:
@@ -96,7 +53,6 @@ def __call__(self, input_ids: List[int],
                           device=scores.device)
         mask[allowed_tokens] = 0
         scores.add_(mask)
-
         return scores
 
 
@@ -113,7 +69,7 @@ def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase):
             The model's tokenizer
 
         """
-        tokenizer = self.adapt_tokenizer(tokenizer)
+        tokenizer = _adapt_tokenizer(tokenizer)
         fsm = RegexFSM(regex_string, tokenizer)
         self.fsm = fsm
 
@@ -167,6 +123,54 @@ def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
             The model's tokenizer
 
         """
-        tokenizer = self.adapt_tokenizer(tokenizer)
+        tokenizer = _adapt_tokenizer(tokenizer)
         fsm = CFGFSM(cfg, tokenizer)
         self.fsm = fsm
+
+
+@lru_cache
+def _adapt_tokenizer(tokenizer: PreTrainedTokenizerBase):
+    """Adapt vLLM's tokenizer to use to compile the FSM.
+
+    The API of Outlines tokenizers is slightly different to that of
+    `transformers`. The decoder of outlines, returns a list whereas
+    the decode of vLLM returns an str. To sync the vLLM decoder with
+    outlines internal api, the decoder should be adapted. In addition
+    we need to handle the missing spaces to Llama's tokenizer to be
+    able to compile FSMs for this model.
+
+    """
+    if getattr(tokenizer, "_outlines_adapted", False):
+        return tokenizer
+
+    tokenizer = copy.deepcopy(tokenizer)
+
+    tokenizer.vocabulary = tokenizer.get_vocab()
+    tokenizer.special_tokens = set(tokenizer.all_special_tokens)
+
+    def convert_token_to_string(token: str) -> str:
+        from transformers.file_utils import SPIECE_UNDERLINE
+
+        string = tokenizer.convert_tokens_to_string([token])
+
+        # A hack to handle missing spaces to HF's Llama tokenizers
+        if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
+            return " " + string
+
+        return string
+
+    def change_decoder(
+        decoder: Callable[[List[int]],
+                          str]) -> Callable[[List[int]], List[str]]:
+        """Sync vLLM's decoder with the outlines by returning list."""
+
+        def new_decoder(inp_tokens: List[int]) -> List[str]:
+            return [decoder(inp_tokens)]
+
+        return new_decoder
+
+    tokenizer.convert_token_to_string = convert_token_to_string
+    tokenizer.decode = change_decoder(tokenizer.decode)
+    setattr(tokenizer, "_outlines_adapted", True)  # noqa: B010
+
+    return tokenizer

From 2986e805ec5883c41463b706ab6f8c5cb19fca55 Mon Sep 17 00:00:00 2001
From: Antoni Baum <antoni.baum@protonmail.com>
Date: Tue, 16 Apr 2024 11:34:39 -0700
Subject: [PATCH 027/106] [Core] Refactor model loading code (#4097)

---
 .buildkite/test-pipeline.yaml                 |   2 +-
 examples/fp8/extract_scales.py                |   4 +-
 examples/tensorize_vllm_model.py              |   2 +-
 tests/lora/conftest.py                        |  10 +-
 tests/lora/test_worker.py                     |  10 +-
 tests/model_executor/weight_utils.py          |   2 +-
 .../test_autogptq_marlin_configs.py           |   4 -
 tests/samplers/test_sampler.py                |  15 +-
 tests/spec_decode/utils.py                    |   1 +
 .../__init__.py                               |   0
 .../tensorize_vllm_model_for_testing.py       |   4 +-
 .../test_tensorizer.py                        | 139 ++++---
 tests/test_config.py                          |   4 -
 tests/test_logits_processor.py                |   7 +-
 tests/worker/test_model_runner.py             |  39 +-
 tests/worker/test_swap.py                     |   1 +
 vllm/config.py                                | 201 ++++------
 vllm/engine/arg_utils.py                      |  71 ++--
 vllm/engine/llm_engine.py                     |  19 +-
 vllm/executor/cpu_executor.py                 |   1 +
 vllm/executor/executor_base.py                |  10 +-
 vllm/executor/gpu_executor.py                 |   2 +-
 vllm/executor/ray_gpu_executor.py             |   5 +-
 vllm/model_executor/model_loader/__init__.py  |  30 ++
 vllm/model_executor/model_loader/loader.py    | 354 ++++++++++++++++++
 .../neuron.py}                                |   0
 .../tensorizer.py}                            | 116 ++++--
 vllm/model_executor/model_loader/utils.py     |  40 ++
 .../{ => model_loader}/weight_utils.py        | 295 +++++++--------
 vllm/model_executor/models/baichuan.py        |  14 +-
 vllm/model_executor/models/bloom.py           |  14 +-
 vllm/model_executor/models/chatglm.py         |  14 +-
 vllm/model_executor/models/commandr.py        |  16 +-
 vllm/model_executor/models/dbrx.py            |  16 +-
 vllm/model_executor/models/decilm.py          |  14 +-
 vllm/model_executor/models/deepseek.py        |  20 +-
 vllm/model_executor/models/falcon.py          |  14 +-
 vllm/model_executor/models/gemma.py           |  14 +-
 vllm/model_executor/models/gpt2.py            |  14 +-
 vllm/model_executor/models/gpt_bigcode.py     |  14 +-
 vllm/model_executor/models/gpt_j.py           |  14 +-
 vllm/model_executor/models/gpt_neox.py        |  14 +-
 vllm/model_executor/models/internlm2.py       |  14 +-
 vllm/model_executor/models/jais.py            |  16 +-
 vllm/model_executor/models/llama.py           |  16 +-
 vllm/model_executor/models/llava.py           |  14 +-
 vllm/model_executor/models/minicpm.py         |  14 +-
 vllm/model_executor/models/mixtral.py         |  20 +-
 vllm/model_executor/models/mixtral_quant.py   |  19 +-
 vllm/model_executor/models/mpt.py             |  14 +-
 vllm/model_executor/models/olmo.py            |  16 +-
 vllm/model_executor/models/opt.py             |  14 +-
 vllm/model_executor/models/orion.py           |  14 +-
 vllm/model_executor/models/phi.py             |  14 +-
 vllm/model_executor/models/qwen.py            |  14 +-
 vllm/model_executor/models/qwen2.py           |  14 +-
 vllm/model_executor/models/qwen2_moe.py       |  20 +-
 vllm/model_executor/models/stablelm.py        |  14 +-
 vllm/model_executor/models/starcoder2.py      |  14 +-
 vllm/model_executor/models/xverse.py          |  14 +-
 vllm/transformers_utils/tokenizer.py          |  21 +-
 vllm/worker/cpu_model_runner.py               |  12 +-
 vllm/worker/cpu_worker.py                     |   7 +-
 vllm/worker/model_runner.py                   |  15 +-
 vllm/worker/neuron_model_runner.py            |   2 +-
 vllm/worker/worker.py                         |  10 +-
 66 files changed, 1064 insertions(+), 858 deletions(-)
 rename tests/{tensorizer => tensorizer_loader}/__init__.py (100%)
 rename tests/{tensorizer => tensorizer_loader}/tensorize_vllm_model_for_testing.py (98%)
 rename tests/{tensorizer => tensorizer_loader}/test_tensorizer.py (67%)
 create mode 100644 vllm/model_executor/model_loader/__init__.py
 create mode 100644 vllm/model_executor/model_loader/loader.py
 rename vllm/model_executor/{neuron_model_loader.py => model_loader/neuron.py} (100%)
 rename vllm/model_executor/{tensorizer_loader.py => model_loader/tensorizer.py} (78%)
 create mode 100644 vllm/model_executor/model_loader/utils.py
 rename vllm/model_executor/{ => model_loader}/weight_utils.py (56%)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index aa4582bbda0c7..f39c3302ac2e9 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -92,7 +92,7 @@ steps:
   parallelism: 4
 
 - label: Tensorizer Test
-  command: apt-get install curl libsodium23 && pytest -v -s tensorizer
+  command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
 
 - label: Metrics Test
   command: pytest -v -s metrics
diff --git a/examples/fp8/extract_scales.py b/examples/fp8/extract_scales.py
index 5e5b31265e3af..1eb961a5a76e3 100644
--- a/examples/fp8/extract_scales.py
+++ b/examples/fp8/extract_scales.py
@@ -11,7 +11,7 @@
 from vllm.model_executor.layers.quantization.schema import QuantParamSchema
 
 
-# Adapted from vllm/model_executor/weight_utils.py
+# Adapted from vllm/model_executor/model_loader/weight_utils.py
 # The main differences are that we add the NPZ format and simplify
 # its functionality drastically for our purposes (e.g. we assume that
 # the quantized model exists locally and there is no need to download it)
@@ -71,7 +71,7 @@ def _prepare_hf_weights(
     return hf_weights_files, use_safetensors
 
 
-# Adapted from vllm/model_executor/weight_utils.py
+# Adapted from vllm/model_executor/model_loader/weight_utils.py
 def _hf_tensorfile_iterator(filename: str, load_format: str,
                             use_safetensors: bool):
     if load_format == "npz":
diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py
index 8cf8be09d0b9c..e2456168de9d5 100644
--- a/examples/tensorize_vllm_model.py
+++ b/examples/tensorize_vllm_model.py
@@ -16,8 +16,8 @@
 from vllm.distributed import initialize_model_parallel
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
+from vllm.model_executor.model_loader.tensorizer import TensorizerArgs
 from vllm.model_executor.models import ModelRegistry
-from vllm.model_executor.tensorizer_loader import TensorizerArgs
 
 # yapf conflicts with isort for this docstring
 # yapf: disable
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 1127cc33183c9..2dabfb6b4337c 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -153,11 +153,11 @@ def llama_2_7b_engine_extra_embeddings() -> nn.Module:
     cleanup()
     get_model_old = get_model
 
-    def get_model_patched(model_config, device_config, **kwargs):
-        return get_model_old(model_config,
-                             device_config,
-                             lora_config=LoRAConfig(max_loras=4,
-                                                    max_lora_rank=8))
+    def get_model_patched(*, model_config, device_config, **kwargs):
+        kwargs["lora_config"] = LoRAConfig(max_loras=4, max_lora_rank=8)
+        return get_model_old(model_config=model_config,
+                             device_config=device_config,
+                             **kwargs)
 
     with patch("vllm.worker.model_runner.get_model", get_model_patched):
         engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index 54594690f7922..732e91a52c0a9 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -3,8 +3,8 @@
 import tempfile
 from unittest.mock import patch
 
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                         ModelConfig, ParallelConfig, SchedulerConfig)
 from vllm.lora.models import LoRAMapping
 from vllm.lora.request import LoRARequest
 from vllm.worker.worker import Worker
@@ -18,12 +18,14 @@ def test_worker_apply_lora(sql_lora_files):
             "meta-llama/Llama-2-7b-hf",
             tokenizer_mode="auto",
             trust_remote_code=False,
-            download_dir=None,
-            load_format="dummy",
             seed=0,
             dtype="float16",
             revision=None,
         ),
+        load_config=LoadConfig(
+            download_dir=None,
+            load_format="dummy",
+        ),
         parallel_config=ParallelConfig(1, 1, False),
         scheduler_config=SchedulerConfig(32, 32, 32),
         device_config=DeviceConfig("cuda"),
diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/weight_utils.py
index 3154f2826d10c..b0086dd7a7d71 100644
--- a/tests/model_executor/weight_utils.py
+++ b/tests/model_executor/weight_utils.py
@@ -3,7 +3,7 @@
 import huggingface_hub.constants
 import pytest
 
-from vllm.model_executor.weight_utils import enable_hf_transfer
+from vllm.model_executor.model_loader.weight_utils import enable_hf_transfer
 
 
 def test_hf_transfer_auto_activation():
diff --git a/tests/quantization/test_autogptq_marlin_configs.py b/tests/quantization/test_autogptq_marlin_configs.py
index cd64622e2226f..1310b4da218b5 100644
--- a/tests/quantization/test_autogptq_marlin_configs.py
+++ b/tests/quantization/test_autogptq_marlin_configs.py
@@ -36,8 +36,6 @@ def test_auto_gptq(model_quant_type: str, ) -> None:
         model_path,
         tokenizer_mode="auto",
         trust_remote_code=False,
-        download_dir=None,
-        load_format="dummy",
         seed=0,
         dtype="float16",
         revision=None,
@@ -49,8 +47,6 @@ def test_auto_gptq(model_quant_type: str, ) -> None:
         model_path,
         tokenizer_mode="auto",
         trust_remote_code=False,
-        download_dir=None,
-        load_format="dummy",
         seed=0,
         dtype="float16",
         revision=None,
diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py
index 7c73943f4df9f..1c08786e69ccd 100644
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -35,8 +35,12 @@ def _prepare_test(
                              dtype=input_tensor.dtype)
     sampler = MockLogitsSampler(fake_logits)
     # UPSTREAM SYNC: passing device required for multi-gpu tests
-    model_runner = ModelRunner(None, None, None, DeviceConfig(device=device),
-                               None)
+    model_runner = ModelRunner(model_config=None,
+                               parallel_config=None,
+                               scheduler_config=None,
+                               device_config=DeviceConfig(device=device),
+                               load_config=None,
+                               lora_config=None)
     return input_tensor, fake_logits, sampler, model_runner
 
 
@@ -603,7 +607,12 @@ def test_sampler_top_k_top_p(seed: int, device: str):
                                device=input_tensor.device,
                                dtype=input_tensor.dtype)
     sampler = MockLogitsSampler(fake_logits)
-    model_runner = ModelRunner(None, None, None, None, None)
+    model_runner = ModelRunner(model_config=None,
+                               parallel_config=None,
+                               scheduler_config=None,
+                               device_config=None,
+                               load_config=None,
+                               lora_config=None)
 
     generation_model = GenerationMixin()
     generation_config = GenerationConfig(top_k=top_k,
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index 4637826f254d6..edba4c226b289 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -118,6 +118,7 @@ def create_worker(cls: type,
         scheduler_config=engine_config.scheduler_config,
         device_config=engine_config.device_config,
         cache_config=engine_config.cache_config,
+        load_config=engine_config.load_config,
         local_rank=0,
         rank=0,
         distributed_init_method=distributed_init_method,
diff --git a/tests/tensorizer/__init__.py b/tests/tensorizer_loader/__init__.py
similarity index 100%
rename from tests/tensorizer/__init__.py
rename to tests/tensorizer_loader/__init__.py
diff --git a/tests/tensorizer/tensorize_vllm_model_for_testing.py b/tests/tensorizer_loader/tensorize_vllm_model_for_testing.py
similarity index 98%
rename from tests/tensorizer/tensorize_vllm_model_for_testing.py
rename to tests/tensorizer_loader/tensorize_vllm_model_for_testing.py
index d0be08329fd64..e4b15fd57add4 100644
--- a/tests/tensorizer/tensorize_vllm_model_for_testing.py
+++ b/tests/tensorizer_loader/tensorize_vllm_model_for_testing.py
@@ -16,8 +16,8 @@
 from vllm.distributed import initialize_model_parallel
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
+from vllm.model_executor.model_loader.tensorizer import TensorizerArgs
 from vllm.model_executor.models import ModelRegistry
-from vllm.model_executor.tensorizer_loader import TensorizerArgs
 
 # yapf conflicts with isort for this docstring
 # yapf: disable
@@ -74,7 +74,7 @@ def parse_args():
                     "extremely quickly. Tensor encryption and decryption is "
                     "also supported, although libsodium must be installed to "
                     "use it.")
-    parser = EngineArgs.add_cli_args(parser)
+    parser = TensorizerArgs.add_cli_args(EngineArgs.add_cli_args(parser))
     subparsers = parser.add_subparsers(dest='command')
 
     serialize_parser = subparsers.add_parser(
diff --git a/tests/tensorizer/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
similarity index 67%
rename from tests/tensorizer/test_tensorizer.py
rename to tests/tensorizer_loader/test_tensorizer.py
index 2ab893e95da9c..a97cc0b3706b4 100644
--- a/tests/tensorizer/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -1,16 +1,19 @@
 import gc
+import json
+import os
 import subprocess
 from unittest.mock import MagicMock, patch
 
+import openai
 import pytest
+import ray
 import torch
 
 from tests.entrypoints.test_openai_server import ServerRunner
 from vllm import SamplingParams
-from vllm.config import TensorizerConfig
-from vllm.model_executor.tensorizer_loader import (
-    EncryptionParams, TensorSerializer, is_vllm_serialized_tensorizer,
-    load_with_tensorizer, open_stream)
+from vllm.model_executor.model_loader.tensorizer import (
+    EncryptionParams, TensorizerConfig, TensorSerializer,
+    is_vllm_serialized_tensorizer, load_with_tensorizer, open_stream)
 
 prompts = [
     "Hello, my name is",
@@ -22,6 +25,8 @@
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
 
 model_ref = "facebook/opt-125m"
+tensorize_model_for_testing_script = os.path.join(
+    os.path.dirname(__file__), "tensorize_vllm_model_for_testing.py")
 
 
 def is_curl_installed():
@@ -38,7 +43,7 @@ def tensorizer_config():
     return config
 
 
-@patch('vllm.model_executor.tensorizer_loader.TensorizerAgent')
+@patch('vllm.model_executor.model_loader.tensorizer.TensorizerAgent')
 def test_load_with_tensorizer(mock_agent, tensorizer_config):
     mock_linear_method = MagicMock()
     mock_agent_instance = mock_agent.return_value
@@ -81,11 +86,13 @@ def test_deserialized_vllm_model_has_same_outputs(vllm_runner, tmp_path):
     del vllm_model, model
     gc.collect()
     torch.cuda.empty_cache()
-    loaded_vllm_model = vllm_runner(model_ref,
-                                    load_format="tensorizer",
-                                    tensorizer_uri=model_path,
-                                    num_readers=1,
-                                    vllm_tensorized=True)
+    loaded_vllm_model = vllm_runner(
+        model_ref,
+        load_format="tensorizer",
+        model_loader_extra_config=TensorizerConfig(tensorizer_uri=model_path,
+                                                   num_readers=1,
+                                                   vllm_tensorized=True),
+    )
     deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
 
     # Assumes SamplingParams being seeded ensures the outputs are deterministic
@@ -97,14 +104,14 @@ def test_can_deserialize_s3(vllm_runner):
     model_ref = "EleutherAI/pythia-1.4b"
     tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
 
-    loaded_hf_model = vllm_runner(
-        model_ref,
-        tensorizer_uri=tensorized_path,
-        load_format="tensorizer",
-        num_readers=1,
-        vllm_tensorized=False,
-        s3_endpoint="object.ord1.coreweave.com",
-    )
+    loaded_hf_model = vllm_runner(model_ref,
+                                  load_format="tensorizer",
+                                  model_loader_extra_config=TensorizerConfig(
+                                      tensorizer_uri=tensorized_path,
+                                      num_readers=1,
+                                      vllm_tensorized=False,
+                                      s3_endpoint="object.ord1.coreweave.com",
+                                  ))
 
     deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params)
 
@@ -131,11 +138,12 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
     gc.collect()
     torch.cuda.empty_cache()
     loaded_vllm_model = vllm_runner(model_ref,
-                                    tensorizer_uri=model_path,
                                     load_format="tensorizer",
-                                    encryption_keyfile=key_path,
-                                    num_readers=1,
-                                    vllm_tensorized=True)
+                                    model_loader_extra_config=TensorizerConfig(
+                                        tensorizer_uri=model_path,
+                                        encryption_keyfile=key_path,
+                                        num_readers=1,
+                                        vllm_tensorized=True))
 
     deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
 
@@ -156,10 +164,11 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
     gc.collect()
     torch.cuda.empty_cache()
     loaded_hf_model = vllm_runner(model_ref,
-                                  tensorizer_uri=model_path,
                                   load_format="tensorizer",
-                                  num_readers=1,
-                                  vllm_tensorized=False)
+                                  model_loader_extra_config=TensorizerConfig(
+                                      tensorizer_uri=model_path,
+                                      num_readers=1,
+                                      vllm_tensorized=False))
 
     deserialized_outputs = loaded_hf_model.generate_greedy(
         prompts, max_tokens=max_tokens)
@@ -190,10 +199,12 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
     torch.cuda.empty_cache()
     loaded_vllm_model = vllm_runner(
         model_ref,
-        tensorizer_uri=model_path,
         load_format="tensorizer",
-        num_readers=1,
-        vllm_tensorized=True,
+        model_loader_extra_config=TensorizerConfig(
+            tensorizer_uri=model_path,
+            num_readers=1,
+            vllm_tensorized=True,
+        ),
         enable_lora=True,
         max_loras=1,
         max_lora_rank=8,
@@ -208,16 +219,18 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
 
 def test_load_without_tensorizer_load_format(vllm_runner):
     with pytest.raises(ValueError):
-        vllm_runner(model_ref, tensorizer_uri="test")
+        vllm_runner(model_ref,
+                    model_loader_extra_config=TensorizerConfig(
+                        tensorizer_uri="test", vllm_tensorized=False))
 
 
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_tensorize_vllm_model(tmp_path):
     # Test serialize command
     serialize_args = [
-        "python3", "tensorizer/tensorize_vllm_model_for_testing.py", "--model",
-        model_ref, "--dtype", "float16", "serialize", "--serialized-directory",
-        tmp_path, "--suffix", "tests"
+        "python3", tensorize_model_for_testing_script, "--model", model_ref,
+        "--dtype", "float16", "serialize", "--serialized-directory", tmp_path,
+        "--suffix", "tests"
     ]
     result = subprocess.run(serialize_args, capture_output=True, text=True)
     print(result.stdout)  # Print the output of the serialize command
@@ -229,8 +242,8 @@ def test_tensorize_vllm_model(tmp_path):
 
     # Test deserialize command
     deserialize_args = [
-        "python3", "tensorizer/tensorize_vllm_model_for_testing.py", "--model",
-        model_ref, "--dtype", "float16", "deserialize", "--path-to-tensors",
+        "python3", tensorize_model_for_testing_script, "--model", model_ref,
+        "--dtype", "float16", "deserialize", "--path-to-tensors",
         path_to_tensors
     ]
     result = subprocess.run(deserialize_args, capture_output=True, text=True)
@@ -242,9 +255,9 @@ def test_tensorize_vllm_model(tmp_path):
 def test_openai_apiserver_with_tensorizer(tmp_path):
     ## Serialize model
     serialize_args = [
-        "python3", "tensorizer/tensorize_vllm_model_for_testing.py", "--model",
-        model_ref, "--dtype", "float16", "serialize", "--serialized-directory",
-        tmp_path, "--suffix", "tests"
+        "python3", tensorize_model_for_testing_script, "--model", model_ref,
+        "--dtype", "float16", "serialize", "--serialized-directory", tmp_path,
+        "--suffix", "tests"
     ]
     result = subprocess.run(serialize_args, capture_output=True, text=True)
     print(result.stdout)  # Print the output of the serialize command
@@ -253,25 +266,47 @@ def test_openai_apiserver_with_tensorizer(tmp_path):
                                     f"\n{result.stdout}\n{result.stderr}")
 
     path_to_tensors = f"{tmp_path}/vllm/{model_ref}/tests/model.tensors"
+    model_loader_extra_config = {
+        "tensorizer_uri": path_to_tensors,
+        "vllm_tensorized": True
+    }
 
     ## Start OpenAI API server
     openai_args = [
         "--model", model_ref, "--dtype", "float16", "--load-format",
-        "tensorizer", "--tensorizer-uri", path_to_tensors, "--vllm-tensorized",
-        "--port", "8000"
+        "tensorizer", "--model-loader-extra-config",
+        json.dumps(model_loader_extra_config), "--port", "8000"
     ]
 
     server = ServerRunner.remote(openai_args)
 
+    assert ray.get(server.ready.remote())
     print("Server ready.")
-    assert server.ready.remote()
+
+    client = openai.OpenAI(
+        base_url="http://localhost:8000/v1",
+        api_key="token-abc123",
+    )
+    completion = client.completions.create(model=model_ref,
+                                           prompt="Hello, my name is",
+                                           max_tokens=5,
+                                           temperature=0.0)
+
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 1
+    assert completion.choices[0].text is not None and len(
+        completion.choices[0].text) >= 5
+    assert completion.choices[0].finish_reason == "length"
+    assert completion.usage == openai.types.CompletionUsage(
+        completion_tokens=5, prompt_tokens=6, total_tokens=11)
 
 
 def test_raise_value_error_on_invalid_load_format(vllm_runner):
     with pytest.raises(ValueError):
         vllm_runner(model_ref,
                     load_format="safetensors",
-                    tensorizer_uri="test")
+                    model_loader_extra_config=TensorizerConfig(
+                        tensorizer_uri="test", vllm_tensorized=False))
 
 
 def test_tensorizer_with_tp(vllm_runner):
@@ -281,22 +316,12 @@ def test_tensorizer_with_tp(vllm_runner):
 
         vllm_runner(
             model_ref,
-            tensorizer_uri=tensorized_path,
             load_format="tensorizer",
-            num_readers=1,
-            vllm_tensorized=False,
-            s3_endpoint="object.ord1.coreweave.com",
+            model_loader_extra_config=TensorizerConfig(
+                tensorizer_uri=tensorized_path,
+                num_readers=1,
+                vllm_tensorized=False,
+                s3_endpoint="object.ord1.coreweave.com",
+            ),
             tensor_parallel_size=2,
         )
-
-
-@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
-def test_tensorizer_warn_quant(tmp_path):
-    model_ref = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
-    serialize_args = [
-        "python3", "tensorizer/tensorize_vllm_model_for_testing.py", "--model",
-        model_ref, "--quantization", "gptq", "--tensorizer-uri", "test",
-        "serialize", "--serialized-directory", tmp_path, "--suffix", "tests"
-    ]
-    result = subprocess.run(serialize_args, capture_output=True, text=True)
-    assert 'PerformanceWarning' in result.stderr
diff --git a/tests/test_config.py b/tests/test_config.py
index 13a9f76212679..19db10630bbae 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -11,8 +11,6 @@ def test_get_sliding_window():
         "Qwen/Qwen1.5-7B",
         tokenizer_mode="auto",
         trust_remote_code=False,
-        download_dir=None,
-        load_format="dummy",
         seed=0,
         dtype="float16",
         revision=None,
@@ -30,8 +28,6 @@ def test_get_sliding_window():
         "mistralai/Mistral-7B-v0.1",
         tokenizer_mode="auto",
         trust_remote_code=False,
-        download_dir=None,
-        load_format="dummy",
         seed=0,
         dtype="float16",
         revision=None,
diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py
index fe321520114f7..5bb93ca74855b 100644
--- a/tests/test_logits_processor.py
+++ b/tests/test_logits_processor.py
@@ -37,7 +37,12 @@ def _prepare_test(
                              1e-2,
                              dtype=input_tensor.dtype)
     logits_processor = MockLogitsProcessor(32000, 0.5, fake_logits)
-    model_runner = ModelRunner(None, None, None, None, None)
+    model_runner = ModelRunner(model_config=None,
+                               parallel_config=None,
+                               scheduler_config=None,
+                               device_config=None,
+                               load_config=None,
+                               lora_config=None)
     return input_tensor, fake_logits, logits_processor, model_runner
 
 
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index dcaae4af4a6f8..59bed2ce0dad3 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -12,7 +12,12 @@ def test_prepare_prompt(batch_size):
                                        100000,
                                        100000,
                                        enable_chunked_prefill=False)
-    model_runner = ModelRunner(None, None, scheduler_config, None, None)
+    model_runner = ModelRunner(model_config=None,
+                               parallel_config=None,
+                               scheduler_config=scheduler_config,
+                               device_config=None,
+                               load_config=None,
+                               lora_config=None)
     model_runner.set_block_size(16)
 
     prompt_lens = []
@@ -118,8 +123,6 @@ def test_prepare_decode_cuda_graph(batch_size):
         "facebook/opt-125m",
         tokenizer_mode="auto",
         trust_remote_code=False,
-        download_dir=None,
-        load_format="dummy",
         seed=0,
         dtype="float16",
         revision=None,
@@ -129,8 +132,12 @@ def test_prepare_decode_cuda_graph(batch_size):
                                        100000,
                                        100000,
                                        enable_chunked_prefill=False)
-    model_runner = ModelRunner(model_config, None, scheduler_config, None,
-                               None)
+    model_runner = ModelRunner(model_config=model_config,
+                               parallel_config=None,
+                               scheduler_config=scheduler_config,
+                               device_config=None,
+                               load_config=None,
+                               lora_config=None)
     model_runner.set_block_size(16)
 
     prompt_lens = []
@@ -205,14 +212,17 @@ def test_empty_seq_group():
         "facebook/opt-125m",
         tokenizer_mode="auto",
         trust_remote_code=False,
-        download_dir=None,
-        load_format="dummy",
         seed=0,
         dtype="float16",
         revision=None,
         enforce_eager=False,
     )
-    model_runner = ModelRunner(model_config, None, None, None, None)
+    model_runner = ModelRunner(model_config=model_config,
+                               parallel_config=None,
+                               scheduler_config=None,
+                               device_config=None,
+                               load_config=None,
+                               lora_config=None)
     model_runner.set_block_size(16)
     seq_group_metadata_list = []
     input_tokens, input_positions, attn_metadata, _, _, _, slot_mapping = (
@@ -251,8 +261,6 @@ def mock_get_process_group_ranks(group=None):
         "facebook/opt-125m",
         tokenizer_mode="auto",
         trust_remote_code=False,
-        download_dir=None,
-        load_format="dummy",
         seed=0,
         dtype="float16",
         revision=None,
@@ -262,11 +270,12 @@ def mock_get_process_group_ranks(group=None):
                                        100000,
                                        100000,
                                        enable_chunked_prefill=True)
-    model_runner = ModelRunner(model_config,
-                               None,
-                               scheduler_config,
-                               None,
-                               None,
+    model_runner = ModelRunner(model_config=model_config,
+                               parallel_config=None,
+                               scheduler_config=scheduler_config,
+                               device_config=None,
+                               load_config=None,
+                               lora_config=None,
                                is_driver_worker=True)
     model_runner.set_block_size(16)
 
diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py
index 8edb1cf05c08e..1804cf78d8003 100644
--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
@@ -23,6 +23,7 @@ def test_swap() -> None:
         scheduler_config=engine_config.scheduler_config,
         device_config=engine_config.device_config,
         cache_config=engine_config.cache_config,
+        load_config=engine_config.load_config,
         local_rank=0,
         rank=0,
         distributed_init_method=distributed_init_method,
diff --git a/vllm/config.py b/vllm/config.py
index d14ea20b54581..3008d08c9a8f0 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1,9 +1,7 @@
 import enum
-import io
 import json
 import os
-import typing
-from dataclasses import dataclass, fields
+from dataclasses import dataclass, field, fields
 from typing import TYPE_CHECKING, ClassVar, List, Optional, Union
 
 import torch
@@ -18,10 +16,14 @@
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
 
-    from vllm.model_executor.tensorizer_loader import TensorizerArgs
+    from vllm.model_executor.model_loader.loader import BaseModelLoader
 
 logger = init_logger(__name__)
 
+# If true, will load models from ModelScope instead of Hugging Face Hub.
+VLLM_USE_MODELSCOPE = os.environ.get("VLLM_USE_MODELSCOPE",
+                                     "False").lower() == "true"
+
 _GB = 1 << 30
 
 
@@ -35,18 +37,6 @@ class ModelConfig:
             available, and "slow" will always use the slow tokenizer.
         trust_remote_code: Trust remote code (e.g., from HuggingFace) when
             downloading the model and tokenizer.
-        download_dir: Directory to download and load the weights, default to the
-            default cache directory of huggingface.
-        load_format: The format of the model weights to load:
-            "auto" will try to load the weights in the safetensors format and
-                fall back to the pytorch bin format if safetensors format is
-                not available.
-            "pt" will load the weights in the pytorch bin format.
-            "safetensors" will load the weights in the safetensors format.
-            "npcache" will load the weights in pytorch format and store
-                a numpy cache to speed up the loading.
-            "dummy" will initialize the weights with random values, which is
-                mainly for profiling.
         dtype: Data type for model weights and activations. The "auto" option
             will use FP16 precision for FP32 and FP16 models, and BF16 precision
             for BF16 models.
@@ -83,8 +73,6 @@ def __init__(
         tokenizer: str,
         tokenizer_mode: str,
         trust_remote_code: bool,
-        download_dir: Optional[str],
-        load_format: str,
         dtype: Union[str, torch.dtype],
         seed: int,
         revision: Optional[str] = None,
@@ -103,8 +91,6 @@ def __init__(
         self.tokenizer = tokenizer
         self.tokenizer_mode = tokenizer_mode
         self.trust_remote_code = trust_remote_code
-        self.download_dir = download_dir
-        self.load_format = load_format
         self.seed = seed
         self.revision = revision
         self.code_revision = code_revision
@@ -117,66 +103,18 @@ def __init__(
         self.max_context_len_to_capture = max_context_len_to_capture
         self.max_logprobs = max_logprobs
 
-        if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true":
-            # download model from ModelScope hub,
-            # lazy import so that modelscope is not required for normal use.
-            # pylint: disable=C.
-            from modelscope.hub.snapshot_download import snapshot_download
-
-            if not os.path.exists(model):
-                model_path = snapshot_download(model_id=model,
-                                               cache_dir=download_dir,
-                                               revision=revision)
-            else:
-                model_path = model
-            self.model = model_path
-            self.download_dir = model_path
-            self.tokenizer = model_path
-
         self.hf_config = get_config(self.model, trust_remote_code, revision,
                                     code_revision)
         self.hf_text_config = get_hf_text_config(self.hf_config)
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
         self.max_model_len = _get_and_verify_max_len(self.hf_text_config,
                                                      max_model_len)
-        self._verify_load_format()
         self._verify_tokenizer_mode()
         self._verify_quantization()
         # UPSTREAM SYNC: keep sparsity
         self._verify_sparsity()
         self._verify_cuda_graph()
 
-    def _verify_load_format(self) -> None:
-        load_format = self.load_format.lower()
-        supported_load_format = [
-            "auto", "pt", "safetensors", "npcache", "dummy", "tensorizer"
-        ]
-        rocm_not_supported_load_format: List[str] = []
-        if load_format not in supported_load_format:
-            raise ValueError(
-                f"Unknown load format: {self.load_format}. Must be one of "
-                "'auto', 'pt', 'safetensors', 'npcache', 'tensorizer', or "
-                "'dummy'.")
-        if is_hip() and load_format in rocm_not_supported_load_format:
-            rocm_supported_load_format = [
-                f for f in supported_load_format
-                if (f not in rocm_not_supported_load_format)
-            ]
-            raise ValueError(
-                f"load format '{load_format}' is not supported in ROCm. "
-                f"Supported load format are "
-                f"{rocm_supported_load_format}")
-
-        # TODO: Remove this check once HF updates the pt weights of Mixtral.
-        architectures = getattr(self.hf_config, "architectures", [])
-        # architectures can be None instead of []
-        if architectures and "MixtralForCausalLM" in architectures \
-            and load_format == "pt":
-            raise ValueError(
-                "Currently, the 'pt' format is not supported for Mixtral. "
-                "Please use the 'safetensors' format instead. ")
-        self.load_format = load_format
-
     def _verify_tokenizer_mode(self) -> None:
         tokenizer_mode = self.tokenizer_mode.lower()
         if tokenizer_mode not in ["auto", "slow"]:
@@ -503,6 +441,65 @@ def create_config(
         return tokenizer_pool_config
 
 
+class LoadFormat(str, enum.Enum):
+    AUTO = "auto"
+    PT = "pt"
+    SAFETENSORS = "safetensors"
+    NPCACHE = "npcache"
+    DUMMY = "dummy"
+    TENSORIZER = "tensorizer"
+
+
+@dataclass
+class LoadConfig:
+    """
+        download_dir: Directory to download and load the weights, default to the
+            default cache directory of huggingface.
+        load_format: The format of the model weights to load:
+            "auto" will try to load the weights in the safetensors format and
+                fall back to the pytorch bin format if safetensors format is
+                not available.
+            "pt" will load the weights in the pytorch bin format.
+            "safetensors" will load the weights in the safetensors format.
+            "npcache" will load the weights in pytorch format and store
+                a numpy cache to speed up the loading.
+            "dummy" will initialize the weights with random values, which is
+                mainly for profiling.
+            "tensorizer" will use CoreWeave's tensorizer library for
+                fast weight loading.
+    """
+
+    load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
+    download_dir: Optional[str] = None
+    model_loader_extra_config: Optional[Union[str, dict]] = field(
+        default_factory=dict)
+
+    def __post_init__(self):
+        model_loader_extra_config = self.model_loader_extra_config or {}
+        if isinstance(model_loader_extra_config, str):
+            self.model_loader_extra_config = json.loads(
+                model_loader_extra_config)
+        self._verify_load_format()
+
+    def _verify_load_format(self) -> None:
+        if not isinstance(self.load_format, str):
+            return
+
+        load_format = self.load_format.lower()
+        self.load_format = LoadFormat(load_format)
+
+        rocm_not_supported_load_format: List[str] = []
+        if is_hip() and load_format in rocm_not_supported_load_format:
+            rocm_supported_load_format = [
+                f for f in LoadFormat.__members__
+                if (f not in rocm_not_supported_load_format)
+            ]
+            raise ValueError(
+                f"load format '{load_format}' is not supported in ROCm. "
+                f"Supported load formats are "
+                f"{rocm_supported_load_format}")
+
+
 class ParallelConfig:
     """Configuration for the distributed execution.
 
@@ -739,8 +736,6 @@ def maybe_create_spec_config(
             tokenizer=target_model_config.tokenizer,
             tokenizer_mode=target_model_config.tokenizer_mode,
             trust_remote_code=target_model_config.trust_remote_code,
-            download_dir=target_model_config.download_dir,
-            load_format=target_model_config.load_format,
             dtype=target_model_config.dtype,
             seed=target_model_config.seed,
             revision=draft_revision,
@@ -927,65 +922,6 @@ def get_image_input_enum_type(
                              f"{[x.name for x in cls.ImageInputType]}.") from e
 
 
-@dataclass
-class TensorizerConfig:
-    tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, typing.BinaryIO,
-                          str, bytes, os.PathLike, int]
-    vllm_tensorized: bool
-    verify_hash: Optional[bool] = False
-    num_readers: Optional[int] = 1
-    encryption_keyfile: Optional[str] = None
-    s3_access_key_id: Optional[str] = None
-    s3_secret_access_key: Optional[str] = None
-    s3_endpoint: Optional[str] = None
-    model_class: Optional[torch.nn.Module] = None
-    hf_config: Optional[PretrainedConfig] = None
-    dtype: Union[str, torch.dtype] = None
-
-    def _construct_tensorizer_args(self) -> "TensorizerArgs":
-        from vllm.model_executor.tensorizer_loader import TensorizerArgs
-        tensorizer_args = {
-            "tensorizer_uri": self.tensorizer_uri,
-            "vllm_tensorized": self.vllm_tensorized,
-            "verify_hash": self.verify_hash,
-            "num_readers": self.num_readers,
-            "encryption_keyfile": self.encryption_keyfile,
-            "s3_access_key_id": self.s3_access_key_id,
-            "s3_secret_access_key": self.s3_secret_access_key,
-            "s3_endpoint": self.s3_endpoint,
-        }
-        return TensorizerArgs(**tensorizer_args)
-
-    def verify_with_parallel_config(
-        self,
-        parallel_config: "ParallelConfig",
-    ) -> None:
-        if (parallel_config.tensor_parallel_size > 1
-                and self.tensorizer_uri is not None):
-            raise ValueError(
-                "Loading to multiple GPUs is not currently supported with "
-                "vLLM-serialized models. Please set tensor_parallel_size=1."
-                " or use a non-vLLM-serialized model, such as a "
-                "serialized Hugging Face `PretrainedModel`.")
-
-    def verify_with_model_config(self, model_config) -> None:
-        if (model_config.quantization is not None
-                and self.tensorizer_uri is not None):
-            from vllm.model_executor.tensorizer_loader import (
-                tensorizer_warning)
-            tensorizer_warning(
-                "Loading a model using Tensorizer with quantization on vLLM"
-                " is unstable and may lead to errors.")
-
-        if (model_config.load_format != "tensorizer"
-                and self.tensorizer_uri is not None):
-            raise ValueError(
-                "A tensorizer uri was passed for tensorizer loading, but the "
-                f"load format was set to {model_config.load_format}. "
-                "Please set the load format to 'tensorizer' to use "
-                f"tensorizer args.")
-
-
 _STR_DTYPE_TO_TORCH_DTYPE = {
     "half": torch.float16,
     "float16": torch.float16,
@@ -1145,11 +1081,11 @@ class EngineConfig:
     parallel_config: ParallelConfig
     scheduler_config: SchedulerConfig
     device_config: DeviceConfig
+    load_config: LoadConfig
     lora_config: Optional[LoRAConfig]
     vision_language_config: Optional[VisionLanguageConfig]
     speculative_config: Optional[SpeculativeConfig]
     decoding_config: Optional[DecodingConfig]
-    tensorizer_config: Optional[TensorizerConfig]
 
     def __post_init__(self):
         """Verify configs are valid & consistent with each other.
@@ -1157,11 +1093,6 @@ def __post_init__(self):
         self.model_config.verify_with_parallel_config(self.parallel_config)
         self.cache_config.verify_with_parallel_config(self.parallel_config)
 
-        if self.tensorizer_config:
-            self.tensorizer_config.verify_with_parallel_config(
-                self.parallel_config)
-            self.tensorizer_config.verify_with_model_config(self.model_config)
-
         if self.lora_config:
             self.lora_config.verify_with_model_config(self.model_config)
             self.lora_config.verify_with_scheduler_config(
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 16333bf380ae8..ffcd66eb59815 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -2,16 +2,13 @@
 
 import argparse
 import dataclasses
-import io
-import os
 from dataclasses import dataclass
-from typing import BinaryIO, Optional, Union
+from typing import Optional
 
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
-                         EngineConfig, LoRAConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, SpeculativeConfig, TensorizerConfig,
+                         EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
                          TokenizerPoolConfig, VisionLanguageConfig)
-from vllm.model_executor.tensorizer_loader import TensorizerArgs
 from vllm.utils import str_to_int_tuple
 
 
@@ -64,17 +61,7 @@ class EngineArgs:
     ray_workers_use_nsight: bool = False
     num_gpu_blocks_override: Optional[int] = None
     num_lookahead_slots: int = 0
-
-    # Tensorizer configuration parameters
-    tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, BinaryIO, str,
-                          bytes, os.PathLike, int] = None
-    vllm_tensorized: bool = False
-    verify_hash: Optional[bool] = False
-    num_readers: Optional[int] = 1
-    encryption_keyfile: Optional[str] = None
-    s3_access_key_id: Optional[str] = None
-    s3_secret_access_key: Optional[str] = None
-    s3_endpoint: Optional[str] = None
+    model_loader_extra_config: Optional[dict] = None
 
     # Related to Vision-language models such as llava
     image_input_type: Optional[str] = None
@@ -444,7 +431,16 @@ def add_cli_args(
             default=None,
             help='The number of speculative tokens to sample from '
             'the draft model in speculative decoding')
-        parser = TensorizerArgs.add_cli_args(parser)
+
+        parser.add_argument('--model-loader-extra-config',
+                            type=str,
+                            default=EngineArgs.model_loader_extra_config,
+                            help='Extra config for model loader. '
+                            'This will be passed to the model loader '
+                            'corresponding to the chosen load_format. '
+                            'This should be a JSON string that will be '
+                            'parsed into a dictionary.')
+
         return parser
 
     @classmethod
@@ -458,24 +454,12 @@ def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
     def create_engine_config(self, ) -> EngineConfig:
         device_config = DeviceConfig(self.device)
         model_config = ModelConfig(
-            self.model,
-            self.tokenizer,
-            self.tokenizer_mode,
-            self.trust_remote_code,
-            self.download_dir,
-            self.load_format,
-            self.dtype,
-            self.seed,
-            self.revision,
-            self.code_revision,
-            self.tokenizer_revision,
-            self.max_model_len,
-            self.quantization,
-            self.quantization_param_path,
+            self.model, self.tokenizer, self.tokenizer_mode,
+            self.trust_remote_code, self.dtype, self.seed, self.revision,
+            self.code_revision, self.tokenizer_revision, self.max_model_len,
             # UPSTREAM SYNC: keep sparsity argument
-            self.sparsity,
-            self.enforce_eager,
-            self.max_context_len_to_capture,
+            self.quantization, self.quantization_param_path, self.sparsity,
+            self.enforce_eager,self.max_context_len_to_capture,
             self.max_logprobs)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
@@ -520,15 +504,10 @@ def create_engine_config(self, ) -> EngineConfig:
             max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
             and self.max_cpu_loras > 0 else None) if self.enable_lora else None
 
-        tensorizer_config = TensorizerConfig(
-            tensorizer_uri=self.tensorizer_uri,
-            vllm_tensorized=self.vllm_tensorized,
-            verify_hash=self.verify_hash,
-            num_readers=self.num_readers,
-            encryption_keyfile=self.encryption_keyfile,
-            s3_access_key_id=self.s3_access_key_id,
-            s3_secret_access_key=self.s3_secret_access_key,
-            s3_endpoint=self.s3_endpoint,
+        load_config = LoadConfig(
+            load_format=self.load_format,
+            download_dir=self.download_dir,
+            model_loader_extra_config=self.model_loader_extra_config,
         )
 
         if self.image_input_type:
@@ -558,8 +537,8 @@ def create_engine_config(self, ) -> EngineConfig:
                             lora_config=lora_config,
                             vision_language_config=vision_language_config,
                             speculative_config=speculative_config,
-                            decoding_config=decoding_config,
-                            tensorizer_config=tensorizer_config)
+                            load_config=load_config,
+                            decoding_config=decoding_config)
 
 
 @dataclass
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 2d30e1fe0d64a..7fb8c55994e34 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -4,9 +4,9 @@
 from transformers import PreTrainedTokenizer
 
 import vllm
-from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         SpeculativeConfig, TensorizerConfig,
+from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
+                         LoRAConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig, SpeculativeConfig,
                          VisionLanguageConfig)
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
@@ -72,11 +72,11 @@ def __init__(
         parallel_config: ParallelConfig,
         scheduler_config: SchedulerConfig,
         device_config: DeviceConfig,
+        load_config: LoadConfig,
         lora_config: Optional[LoRAConfig],
         vision_language_config: Optional[VisionLanguageConfig],
         speculative_config: Optional[SpeculativeConfig],
         decoding_config: Optional[DecodingConfig],
-        tensorizer_config: Optional[TensorizerConfig],
         executor_class: Type[ExecutorBase],
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
@@ -92,8 +92,8 @@ def __init__(
             f"trust_remote_code={model_config.trust_remote_code}, "
             f"dtype={model_config.dtype}, "
             f"max_seq_len={model_config.max_model_len}, "
-            f"download_dir={model_config.download_dir!r}, "
-            f"load_format={model_config.load_format}, "
+            f"download_dir={load_config.download_dir!r}, "
+            f"load_format={load_config.load_format}, "
             f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
             f"disable_custom_all_reduce="
             f"{parallel_config.disable_custom_all_reduce}, "
@@ -116,8 +116,8 @@ def __init__(
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.speculative_config = speculative_config
+        self.load_config = load_config
         self.decoding_config = decoding_config or DecodingConfig()
-        self.tensorizer_config = tensorizer_config
         self.log_stats = log_stats
 
         self._init_tokenizer()
@@ -133,7 +133,7 @@ def __init__(
             lora_config=lora_config,
             vision_language_config=vision_language_config,
             speculative_config=speculative_config,
-            tensorizer_config=tensorizer_config,
+            load_config=load_config,
         )
 
         self._initialize_kv_caches()
@@ -273,9 +273,6 @@ def _init_tokenizer(self, **tokenizer_init_kwargs):
     def _verify_args(self) -> None:
         self.model_config.verify_with_parallel_config(self.parallel_config)
         self.cache_config.verify_with_parallel_config(self.parallel_config)
-        if self.tensorizer_config:
-            self.tensorizer_config.verify_with_parallel_config(
-                self.parallel_config)
         if self.lora_config:
             self.lora_config.verify_with_model_config(self.model_config)
             self.lora_config.verify_with_scheduler_config(
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index f562e4e0ae3de..426e2c41d8427 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -40,6 +40,7 @@ def _init_worker(self):
             scheduler_config=self.scheduler_config,
             device_config=self.device_config,
             cache_config=self.cache_config,
+            load_config=self.load_config,
             local_rank=0,
             rank=0,
             distributed_init_method=distributed_init_method,
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index bbb6ec80f7b7e..8cc04c5299ca1 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,9 +1,9 @@
 from abc import ABC, abstractmethod
 from typing import Dict, List, Optional, Set, Tuple
 
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
-                         TensorizerConfig, VisionLanguageConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                         ModelConfig, ParallelConfig, SchedulerConfig,
+                         SpeculativeConfig, VisionLanguageConfig)
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 
@@ -23,20 +23,20 @@ def __init__(
         parallel_config: ParallelConfig,
         scheduler_config: SchedulerConfig,
         device_config: DeviceConfig,
+        load_config: LoadConfig,
         lora_config: Optional[LoRAConfig],
         vision_language_config: Optional[VisionLanguageConfig],
         speculative_config: Optional[SpeculativeConfig],
-        tensorizer_config: Optional[TensorizerConfig],
     ) -> None:
         self.model_config = model_config
         self.cache_config = cache_config
         self.lora_config = lora_config
+        self.load_config = load_config
         self.parallel_config = parallel_config
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.vision_language_config = vision_language_config
         self.speculative_config = speculative_config
-        self.tensorizer_config = tensorizer_config
 
         self._init_executor()
 
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index bae509f48025b..3a9537effe6d9 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -35,12 +35,12 @@ def _init_worker(self):
             scheduler_config=self.scheduler_config,
             device_config=self.device_config,
             cache_config=self.cache_config,
+            load_config=self.load_config,
             local_rank=0,
             rank=0,
             distributed_init_method=distributed_init_method,
             lora_config=self.lora_config,
             vision_language_config=self.vision_language_config,
-            tensorizer_config=self.tensorizer_config,
             is_driver_worker=True,
         )
         self.driver_worker.init_device()
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 7aca5e36107aa..4065c0868d79a 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -147,6 +147,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
         model_config = copy.deepcopy(self.model_config)
         parallel_config = copy.deepcopy(self.parallel_config)
         scheduler_config = copy.deepcopy(self.scheduler_config)
+        load_config = copy.deepcopy(self.load_config)
         device_config = copy.deepcopy(self.device_config)
         lora_config = copy.deepcopy(self.lora_config)
         cache_config = copy.deepcopy(self.cache_config)
@@ -165,12 +166,12 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
                     scheduler_config=scheduler_config,
                     device_config=device_config,
                     cache_config=cache_config,
+                    load_config=load_config,
                     local_rank=local_rank,
                     rank=rank,
                     distributed_init_method=distributed_init_method,
                     lora_config=lora_config,
                     vision_language_config=vision_language_config,
-                    tensorizer_config=self.tensorizer_config,
                 ))
 
         # Initialize the driver worker with the Worker class.
@@ -187,7 +188,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
             distributed_init_method=distributed_init_method,
             lora_config=self.lora_config,
             vision_language_config=self.vision_language_config,
-            tensorizer_config=self.tensorizer_config,
+            load_config=self.load_config,
             is_driver_worker=True,
         )
 
diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py
new file mode 100644
index 0000000000000..6f90e49994fb2
--- /dev/null
+++ b/vllm/model_executor/model_loader/__init__.py
@@ -0,0 +1,30 @@
+from typing import Optional
+
+from torch import nn
+
+from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig, VisionLanguageConfig)
+from vllm.model_executor.model_loader.loader import (BaseModelLoader,
+                                                     get_model_loader)
+from vllm.model_executor.model_loader.utils import (
+    get_architecture_class_name, get_model_architecture)
+
+
+def get_model(
+        *, model_config: ModelConfig, load_config: LoadConfig,
+        device_config: DeviceConfig, parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig, lora_config: Optional[LoRAConfig],
+        vision_language_config: Optional[VisionLanguageConfig]) -> nn.Module:
+    loader = get_model_loader(load_config)
+    return loader.load_model(model_config=model_config,
+                             device_config=device_config,
+                             lora_config=lora_config,
+                             vision_language_config=vision_language_config,
+                             parallel_config=parallel_config,
+                             scheduler_config=scheduler_config)
+
+
+__all__ = [
+    "get_model", "get_model_loader", "BaseModelLoader",
+    "get_architecture_class_name", "get_model_architecture"
+]
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
new file mode 100644
index 0000000000000..3b1d125ef8a67
--- /dev/null
+++ b/vllm/model_executor/model_loader/loader.py
@@ -0,0 +1,354 @@
+# ruff: noqa: SIM117
+import copy
+import glob
+import os
+from abc import ABC, abstractmethod
+from typing import (TYPE_CHECKING, Any, Dict, Generator, List, Optional, Tuple,
+                    Type)
+
+import torch
+from torch import nn
+
+from vllm.config import (VLLM_USE_MODELSCOPE, DeviceConfig, LoadConfig,
+                         LoadFormat, LoRAConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig, VisionLanguageConfig)
+from vllm.logger import init_logger
+from vllm.model_executor.model_loader.tensorizer import (
+    TensorizerConfig, is_vllm_serialized_tensorizer, load_with_tensorizer,
+    tensorizer_weights_iterator)
+from vllm.model_executor.model_loader.utils import (get_model_architecture,
+                                                    set_default_torch_dtype)
+from vllm.model_executor.model_loader.weight_utils import (
+    download_weights_from_hf, filter_files_not_needed_for_inference,
+    get_quant_config, initialize_dummy_weights, np_cache_weights_iterator,
+    pt_weights_iterator, safetensors_weights_iterator)
+from vllm.model_executor.models.llava import LlavaForConditionalGeneration
+
+if TYPE_CHECKING:
+    from vllm.model_executor.layers.linear import LinearMethodBase
+
+_VISION_MODEL_CLASSES = [
+    LlavaForConditionalGeneration,
+]
+
+logger = init_logger(__name__)
+
+
+def _get_linear_method(
+        model_config: ModelConfig,
+        load_config: LoadConfig) -> Optional["LinearMethodBase"]:
+    """Get the (maybe quantized) linear method."""
+    linear_method = None
+    if model_config.quantization is not None:
+        quant_config = get_quant_config(model_config, load_config)
+        capability = torch.cuda.get_device_capability()
+        capability = capability[0] * 10 + capability[1]
+        if capability < quant_config.get_min_capability():
+            raise ValueError(
+                f"The quantization method {model_config.quantization} is not "
+                "supported for the current GPU. "
+                f"Minimum capability: {quant_config.get_min_capability()}. "
+                f"Current capability: {capability}.")
+        supported_dtypes = quant_config.get_supported_act_dtypes()
+        if model_config.dtype not in supported_dtypes:
+            raise ValueError(
+                f"{model_config.dtype} is not supported for quantization "
+                f"method {model_config.quantization}. Supported dtypes: "
+                f"{supported_dtypes}")
+
+        linear_method = quant_config.get_linear_method()
+    return linear_method
+
+
+def _get_model_initialization_kwargs(
+        model_class: Type[nn.Module], lora_config: Optional[LoRAConfig],
+        vision_language_config: Optional[VisionLanguageConfig]
+) -> Dict[str, Any]:
+    """Get extra kwargs for model initialization."""
+    extra_kwargs = {}
+    if hasattr(model_class, "supported_lora_modules"):
+        extra_kwargs["lora_config"] = lora_config
+    elif lora_config:
+        raise ValueError(
+            f"Model {model_class.__name__} does not support LoRA, "
+            "but LoRA is enabled. Support for this model may "
+            "be added in the future. If this is important to you, "
+            "please open an issue on github.")
+    elif model_class in _VISION_MODEL_CLASSES:
+        extra_kwargs["vision_language_config"] = vision_language_config
+    return extra_kwargs
+
+
+def _initialize_model(
+        model_config: ModelConfig, load_config: LoadConfig,
+        lora_config: Optional[LoRAConfig],
+        vision_language_config: Optional[VisionLanguageConfig]) -> nn.Module:
+    """Initialize a model with the given configurations."""
+    model_class = get_model_architecture(model_config)[0]
+    linear_method = _get_linear_method(model_config, load_config)
+
+    return model_class(config=model_config.hf_config,
+                       linear_method=linear_method,
+                       **_get_model_initialization_kwargs(
+                           model_class, lora_config, vision_language_config))
+
+
+class BaseModelLoader(ABC):
+    """Base class for model loaders."""
+
+    def __init__(self, load_config: LoadConfig):
+        self.load_config = load_config
+
+    @abstractmethod
+    def load_model(self, *, model_config: ModelConfig,
+                   device_config: DeviceConfig,
+                   lora_config: Optional[LoRAConfig],
+                   vision_language_config: Optional[VisionLanguageConfig],
+                   parallel_config: ParallelConfig,
+                   scheduler_config: SchedulerConfig) -> nn.Module:
+        """Load a model with the given configurations."""
+        ...
+
+
+class DefaultModelLoader(BaseModelLoader):
+    """Model loader that can load different file types from disk."""
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        if load_config.model_loader_extra_config:
+            raise ValueError(f"Model loader extra config is not supported for "
+                             f"load format {load_config.load_format}")
+
+    def _maybe_download_from_modelscope(
+            self, model: str, revision: Optional[str]) -> Optional[str]:
+        """Download model from ModelScope hub if VLLM_USE_MODELSCOPE is True.
+        
+        Returns the path to the downloaded model, or None if the model is not
+        downloaded from ModelScope."""
+        if VLLM_USE_MODELSCOPE:
+            # download model from ModelScope hub,
+            # lazy import so that modelscope is not required for normal use.
+            # pylint: disable=C.
+            from modelscope.hub.snapshot_download import snapshot_download
+
+            if not os.path.exists(model):
+                model_path = snapshot_download(
+                    model_id=model,
+                    cache_dir=self.load_config.download_dir,
+                    revision=revision)
+            else:
+                model_path = model
+            return model_path
+        return None
+
+    def _prepare_weights(self, model_name_or_path: str,
+                         revision: Optional[str],
+                         fall_back_to_pt: bool) -> Tuple[str, List[str], bool]:
+        """Prepare weights for the model.
+
+        If the model is not local, it will be downloaded."""
+        model_name_or_path = self._maybe_download_from_modelscope(
+            model_name_or_path, revision) or model_name_or_path
+
+        is_local = os.path.isdir(model_name_or_path)
+        load_format = self.load_config.load_format
+        use_safetensors = False
+        # Some quantized models use .pt files for storing the weights.
+        if load_format == LoadFormat.AUTO:
+            allow_patterns = ["*.safetensors", "*.bin"]
+        elif load_format == LoadFormat.SAFETENSORS:
+            use_safetensors = True
+            allow_patterns = ["*.safetensors"]
+        elif load_format == LoadFormat.PT:
+            allow_patterns = ["*.pt"]
+        elif load_format == LoadFormat.NPCACHE:
+            allow_patterns = ["*.bin"]
+        else:
+            raise ValueError(f"Unknown load_format: {load_format}")
+
+        if fall_back_to_pt:
+            allow_patterns += ["*.pt"]
+
+        if not is_local:
+            hf_folder = download_weights_from_hf(model_name_or_path,
+                                                 self.load_config.download_dir,
+                                                 allow_patterns)
+        else:
+            hf_folder = model_name_or_path
+
+        hf_weights_files: List[str] = []
+        for pattern in allow_patterns:
+            hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
+            if len(hf_weights_files) > 0:
+                if pattern == "*.safetensors":
+                    use_safetensors = True
+                break
+
+        if not use_safetensors:
+            hf_weights_files = filter_files_not_needed_for_inference(
+                hf_weights_files)
+
+        if len(hf_weights_files) == 0:
+            raise RuntimeError(
+                f"Cannot find any model weights with `{model_name_or_path}`")
+
+        return hf_folder, hf_weights_files, use_safetensors
+
+    def _get_weights_iterator(
+        self, model_name_or_path: str, revision: Optional[str],
+        fall_back_to_pt: bool
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        """Get an iterator for the model weights based on the load format."""
+        hf_folder, hf_weights_files, use_safetensors = self._prepare_weights(
+            model_name_or_path, revision, fall_back_to_pt)
+        if self.load_config.load_format == LoadFormat.NPCACHE:
+            # Currently np_cache only support *.bin checkpoints
+            assert use_safetensors is False
+            return np_cache_weights_iterator(model_name_or_path,
+                                             self.load_config.download_dir,
+                                             hf_folder, hf_weights_files)
+        if use_safetensors:
+            return safetensors_weights_iterator(hf_weights_files)
+        return pt_weights_iterator(hf_weights_files)
+
+    def load_model(self, *, model_config: ModelConfig,
+                   device_config: DeviceConfig,
+                   lora_config: Optional[LoRAConfig],
+                   vision_language_config: Optional[VisionLanguageConfig],
+                   parallel_config: ParallelConfig,
+                   scheduler_config: SchedulerConfig) -> nn.Module:
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(model_config, self.load_config,
+                                          lora_config, vision_language_config)
+            model.load_weights(
+                self._get_weights_iterator(model_config.model,
+                                           model_config.revision,
+                                           fall_back_to_pt=getattr(
+                                               model,
+                                               "fall_back_to_pt_during_load",
+                                               True)), )
+        return model.eval()
+
+
+class DummyModelLoader(BaseModelLoader):
+    """Model loader that will set model weights to random values."""
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        if load_config.model_loader_extra_config:
+            raise ValueError(f"Model loader extra config is not supported for "
+                             f"load format {load_config.load_format}")
+
+    def load_model(self, *, model_config: ModelConfig,
+                   device_config: DeviceConfig,
+                   lora_config: Optional[LoRAConfig],
+                   vision_language_config: Optional[VisionLanguageConfig],
+                   parallel_config: ParallelConfig,
+                   scheduler_config: SchedulerConfig) -> nn.Module:
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(model_config, self.load_config,
+                                          lora_config, vision_language_config)
+            # NOTE(woosuk): For accurate performance evaluation, we assign
+            # random values to the weights.
+            initialize_dummy_weights(model)
+        return model.eval()
+
+
+class TensorizerLoader(BaseModelLoader):
+    """Model loader using CoreWeave's tensorizer library."""
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        if isinstance(load_config.model_loader_extra_config, TensorizerConfig):
+            self.tensorizer_config = load_config.model_loader_extra_config
+        else:
+            self.tensorizer_config = TensorizerConfig(
+                **load_config.model_loader_extra_config)
+
+    def _verify_config(self, model_config: ModelConfig,
+                       parallel_config: ParallelConfig):
+        self.tensorizer_config.verify_with_model_config(model_config)
+        self.tensorizer_config.verify_with_parallel_config(parallel_config)
+
+    def _get_weights_iterator(
+            self) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        tensorizer_args = self.tensorizer_config._construct_tensorizer_args()
+        return tensorizer_weights_iterator(tensorizer_args)
+
+    def _load_model_unserialized(
+            self, model_config: ModelConfig, device_config: DeviceConfig,
+            lora_config: Optional[LoRAConfig],
+            vision_language_config: Optional[VisionLanguageConfig]
+    ) -> nn.Module:
+        """Load an unserialized model with tensorizer.
+
+        Unserialized here means "not serialized with tensorizer". This
+        should still be faster than default HuggingFace loading, but will
+        be slower than loading a tensorizer-serialized model.
+        """
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(model_config, self.load_config,
+                                          lora_config, vision_language_config)
+
+            model.load_weights(self._get_weights_iterator())
+        return model.eval()
+
+    def _load_model_serialized(
+            self, model_config: ModelConfig, device_config: DeviceConfig,
+            lora_config: Optional[LoRAConfig],
+            vision_language_config: Optional[VisionLanguageConfig]
+    ) -> nn.Module:
+        """Load a serialized model with tensorizer.
+
+        See the examples/tensorize_vllm_model.py example "
+        script for serializing vLLM models."""
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model_class = get_model_architecture(model_config)[0]
+                linear_method = _get_linear_method(model_config,
+                                                   self.load_config)
+                extra_kwargs = _get_model_initialization_kwargs(
+                    model_class, lora_config, vision_language_config)
+                extra_kwargs["linear_method"] = linear_method
+
+                tensorizer_config = copy.copy(self.tensorizer_config)
+                tensorizer_config.model_class = model_class
+                tensorizer_config.hf_config = model_config.hf_config
+                tensorizer_config.dtype = model_config.dtype
+
+                model = load_with_tensorizer(tensorizer_config, **extra_kwargs)
+        return model.eval()
+
+    def load_model(self, *, model_config: ModelConfig,
+                   device_config: DeviceConfig,
+                   lora_config: Optional[LoRAConfig],
+                   vision_language_config: Optional[VisionLanguageConfig],
+                   parallel_config: ParallelConfig,
+                   scheduler_config: SchedulerConfig) -> nn.Module:
+        self._verify_config(model_config, parallel_config)
+
+        if is_vllm_serialized_tensorizer(self.tensorizer_config):
+            return self._load_model_serialized(model_config, device_config,
+                                               lora_config,
+                                               vision_language_config)
+        return self._load_model_unserialized(model_config, device_config,
+                                             lora_config,
+                                             vision_language_config)
+
+
+def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
+    """Get a model loader based on the load format."""
+
+    if isinstance(load_config.load_format, type):
+        return load_config.load_format(load_config)
+
+    if load_config.load_format == LoadFormat.DUMMY:
+        return DummyModelLoader(load_config)
+
+    if load_config.load_format == LoadFormat.TENSORIZER:
+        return TensorizerLoader(load_config)
+
+    return DefaultModelLoader(load_config)
diff --git a/vllm/model_executor/neuron_model_loader.py b/vllm/model_executor/model_loader/neuron.py
similarity index 100%
rename from vllm/model_executor/neuron_model_loader.py
rename to vllm/model_executor/model_loader/neuron.py
diff --git a/vllm/model_executor/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer.py
similarity index 78%
rename from vllm/model_executor/tensorizer_loader.py
rename to vllm/model_executor/model_loader/tensorizer.py
index 8550cc97aefe8..ad554844384eb 100644
--- a/vllm/model_executor/tensorizer_loader.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -4,20 +4,20 @@
 import os
 import time
 import typing
-import warnings
 from dataclasses import dataclass
-from typing import Optional, Union
+from typing import Generator, Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
+from transformers import PretrainedConfig
 
-from vllm.config import TensorizerConfig
+from vllm.config import ModelConfig, ParallelConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import LinearMethodBase
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 
-tensorizer_load_fail = False
+tensorizer_load_fail = None
 
 try:
     from tensorizer import (DecryptionParams, EncryptionParams,
@@ -25,51 +25,78 @@
     from tensorizer.stream_io import open_stream
     from tensorizer.utils import (convert_bytes, get_mem_usage,
                                   no_init_or_tensor)
-except ImportError:
-    tensorizer_load_fail = True
+except ImportError as e:
+    tensorizer_load_fail = e
 
 __all__ = [
     'EncryptionParams', 'DecryptionParams', 'TensorDeserializer',
     'TensorSerializer', 'open_stream', 'convert_bytes', 'get_mem_usage',
-    'no_init_or_tensor'
+    'no_init_or_tensor', 'TensorizerConfig'
 ]
 
 logger = init_logger(__name__)
 
 
+@dataclass
+class TensorizerConfig:
+    tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, typing.BinaryIO,
+                          str, bytes, os.PathLike, int]
+    vllm_tensorized: bool
+    verify_hash: Optional[bool] = False
+    num_readers: Optional[int] = 1
+    encryption_keyfile: Optional[str] = None
+    s3_access_key_id: Optional[str] = None
+    s3_secret_access_key: Optional[str] = None
+    s3_endpoint: Optional[str] = None
+    model_class: Optional[Type[torch.nn.Module]] = None
+    hf_config: Optional[PretrainedConfig] = None
+    dtype: Optional[Union[str, torch.dtype]] = None
+
+    def _construct_tensorizer_args(self) -> "TensorizerArgs":
+        tensorizer_args = {
+            "tensorizer_uri": self.tensorizer_uri,
+            "vllm_tensorized": self.vllm_tensorized,
+            "verify_hash": self.verify_hash,
+            "num_readers": self.num_readers,
+            "encryption_keyfile": self.encryption_keyfile,
+            "s3_access_key_id": self.s3_access_key_id,
+            "s3_secret_access_key": self.s3_secret_access_key,
+            "s3_endpoint": self.s3_endpoint,
+        }
+        return TensorizerArgs(**tensorizer_args)
+
+    def verify_with_parallel_config(
+        self,
+        parallel_config: "ParallelConfig",
+    ) -> None:
+        if (parallel_config.tensor_parallel_size > 1
+                and self.tensorizer_uri is not None):
+            raise ValueError(
+                "Loading to multiple GPUs is not currently supported with "
+                "vLLM-serialized models. Please set tensor_parallel_size=1."
+                " or use a non-vLLM-serialized model, such as a "
+                "serialized Hugging Face `PretrainedModel`.")
+
+    def verify_with_model_config(self, model_config: "ModelConfig") -> None:
+        if (model_config.quantization is not None
+                and self.tensorizer_uri is not None):
+            logger.warning(
+                "Loading a model using Tensorizer with quantization on vLLM"
+                " is unstable and may lead to errors.")
+
+
 def load_with_tensorizer(tensorizer_config: TensorizerConfig,
                          **extra_kwargs) -> nn.Module:
     tensorizer = TensorizerAgent(tensorizer_config, **extra_kwargs)
     return tensorizer.deserialize()
 
 
-def tensorizer_warning(message: str):
-    return warnings.warn(message, category=PerformanceWarning, stacklevel=2)
-
-
 def is_vllm_serialized_tensorizer(tensorizer_config: TensorizerConfig) -> bool:
     if tensorizer_config is None:
         return False
     return tensorizer_config.vllm_tensorized
 
 
-class ParameterizedLoadFormat(str):
-    __slots__ = "params"
-
-
-class PerformanceWarning(UserWarning):
-
-    def __str__(self):
-        return (f"{super().__str__()}"
-                " (set the VLLM_SILENCE_PERFORMANCE_WARNINGS"
-                " environment variable to hide this)")
-
-
-if (os.getenv("VLLM_SILENCE_PERFORMANCE_WARNINGS", "").lower()
-        not in ("", "0", "n", "no", "off", "disable")):
-    warnings.simplefilter("ignore", category=PerformanceWarning)
-
-
 @dataclass
 class TensorizerArgs:
     tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, typing.BinaryIO,
@@ -219,11 +246,17 @@ class TensorizerAgent:
     behavior of the TensorDeserializer when loading tensors from a serialized
     model. For deserializations of HuggingFace models, TensorDeserializer is
     instead used as an iterator directly in the func hf_model_weights_iterator
-    in vllm/model_executor/weight_utils.py
+    in vllm/model_executor/model_loader/weight_utils.py
     """
 
     def __init__(self, tensorizer_config: TensorizerConfig,
                  linear_method: LinearMethodBase, **extra_kwargs):
+        if tensorizer_load_fail is not None:
+            raise ImportError(
+                "Tensorizer is not installed. Please install tensorizer "
+                "to use this feature with `pip install vllm[tensorizer]`."
+            ) from tensorizer_load_fail
+
         self.tensorizer_config = tensorizer_config
         self.tensorizer_args = (
             self.tensorizer_config._construct_tensorizer_args())
@@ -234,11 +267,6 @@ def __init__(self, tensorizer_config: TensorizerConfig,
             self.linear_method = linear_method
         self.model = self._init_model()
 
-        if tensorizer_load_fail:
-            raise ImportError(
-                "Tensorizer is not installed. Please install tensorizer "
-                "to use this feature with `pip install vllm[tensorizer]`.")
-
     def _init_model(self):
         model_args = self.tensorizer_config.hf_config
         model_args.torch_dtype = self.tensorizer_config.dtype
@@ -313,3 +341,23 @@ def deserialize(self):
         self._check_tensors_on_meta_device()
         self._resize_lora_embeddings()
         return self.model.eval()
+
+
+def tensorizer_weights_iterator(
+    tensorizer_args: "TensorizerArgs"
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    logger.warning(
+        "Deserializing HuggingFace models is not optimized for "
+        "loading on vLLM, as tensorizer is forced to load to CPU. "
+        "Consider deserializing a vLLM model instead for faster "
+        "load times. See the examples/tensorize_vllm_model.py example "
+        "script for serializing vLLM models.")
+
+    deserializer_args = tensorizer_args.deserializer_params
+    stream_params = tensorizer_args.stream_params
+    stream = open_stream(tensorizer_args.tensorizer_uri, **stream_params)
+    with TensorDeserializer(stream, **deserializer_args,
+                            device="cpu") as state:
+        for name, param in state.items():
+            yield name, param
+    del state
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
new file mode 100644
index 0000000000000..a0a3b2784614d
--- /dev/null
+++ b/vllm/model_executor/model_loader/utils.py
@@ -0,0 +1,40 @@
+"""Utilities for selecting and loading models."""
+import contextlib
+from typing import Tuple, Type
+
+import torch
+from torch import nn
+
+from vllm.config import ModelConfig
+from vllm.model_executor.models import ModelRegistry
+
+
+@contextlib.contextmanager
+def set_default_torch_dtype(dtype: torch.dtype):
+    """Sets the default torch dtype to the given dtype."""
+    old_dtype = torch.get_default_dtype()
+    torch.set_default_dtype(dtype)
+    yield
+    torch.set_default_dtype(old_dtype)
+
+
+def get_model_architecture(
+        model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
+    architectures = getattr(model_config.hf_config, "architectures", [])
+    # Special handling for quantized Mixtral.
+    # FIXME(woosuk): This is a temporary hack.
+    if (model_config.quantization is not None
+            and "MixtralForCausalLM" in architectures):
+        architectures = ["QuantMixtralForCausalLM"]
+
+    for arch in architectures:
+        model_cls = ModelRegistry.load_model_cls(arch)
+        if model_cls is not None:
+            return (model_cls, arch)
+    raise ValueError(
+        f"Model architectures {architectures} are not supported for now. "
+        f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+def get_architecture_class_name(model_config: ModelConfig) -> str:
+    return get_model_architecture(model_config)[1]
diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
similarity index 56%
rename from vllm/model_executor/weight_utils.py
rename to vllm/model_executor/model_loader/weight_utils.py
index 6226f9032a98b..de534a1d0e3c4 100644
--- a/vllm/model_executor/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -4,8 +4,9 @@
 import hashlib
 import json
 import os
+import tempfile
 from collections import defaultdict
-from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union
+from typing import Any, Generator, Iterable, List, Optional, Tuple
 
 import filelock
 import huggingface_hub.constants
@@ -15,7 +16,7 @@
 from safetensors.torch import load_file, safe_open, save_file
 from tqdm.auto import tqdm
 
-from vllm.config import ModelConfig
+from vllm.config import LoadConfig, ModelConfig
 from vllm.logger import init_logger
 # UPSTREAM SYNC: needed for sparsity
 from vllm.model_executor.layers.parameters import LazyCompressedParameter
@@ -29,8 +30,7 @@
 # can share the same lock without error.
 # lock files in the temp directory will be automatically deleted when the
 # system reboots, so users will not complain about annoying lock files
-temp_dir = os.environ.get('TMPDIR') or os.environ.get(
-    'TEMP') or os.environ.get('TMP') or "/tmp/"
+temp_dir = tempfile.gettempdir()
 
 
 def enable_hf_transfer():
@@ -48,7 +48,7 @@ def enable_hf_transfer():
 enable_hf_transfer()
 
 
-class Disabledtqdm(tqdm):
+class DisabledTqdm(tqdm):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs, disable=True)
@@ -129,7 +129,8 @@ def get_sparse_config(model_config: ModelConfig):
 
 
 # TODO(woosuk): Move this to other place.
-def get_quant_config(model_config: ModelConfig) -> QuantizationConfig:
+def get_quant_config(model_config: ModelConfig,
+                     load_config: LoadConfig) -> QuantizationConfig:
     quant_cls = get_quantization_config(model_config.quantization)
     # Read the quantization config from the HF model config, if available.
     hf_quant_config = getattr(model_config.hf_config, "quantization_config",
@@ -140,12 +141,12 @@ def get_quant_config(model_config: ModelConfig) -> QuantizationConfig:
     is_local = os.path.isdir(model_name_or_path)
     if not is_local:
         # Download the config files.
-        with get_lock(model_name_or_path, model_config.download_dir):
+        with get_lock(model_name_or_path, load_config.download_dir):
             hf_folder = snapshot_download(model_name_or_path,
                                           revision=model_config.revision,
                                           allow_patterns="*.json",
-                                          cache_dir=model_config.download_dir,
-                                          tqdm_class=Disabledtqdm)
+                                          cache_dir=load_config.download_dir,
+                                          tqdm_class=DisabledTqdm)
     else:
         hf_folder = model_name_or_path
     config_files = glob.glob(os.path.join(hf_folder, "*.json"))
@@ -168,169 +169,127 @@ def get_quant_config(model_config: ModelConfig) -> QuantizationConfig:
     return quant_cls.from_config(config)
 
 
-def prepare_hf_model_weights(
-    model_name_or_path: str,
-    cache_dir: Optional[str] = None,
-    load_format: str = "auto",
-    fall_back_to_pt: bool = True,
-    revision: Optional[str] = None,
-) -> Tuple[str, List[str], bool]:
-    # Download model weights from huggingface.
-    is_local = os.path.isdir(model_name_or_path) \
-               and load_format != "tensorizer"
-    use_safetensors = False
-    # Some quantized models use .pt files for storing the weights.
-    if load_format == "auto":
-        allow_patterns = ["*.safetensors", "*.bin"]
-    elif load_format == "safetensors":
-        use_safetensors = True
-        allow_patterns = ["*.safetensors"]
-    elif load_format == "pt":
-        allow_patterns = ["*.pt"]
-    elif load_format == "npcache":
-        allow_patterns = ["*.bin"]
-    elif load_format == "tensorizer":
-        allow_patterns = ["*.tensors"]
-    else:
-        raise ValueError(f"Unknown load_format: {load_format}")
-
-    if fall_back_to_pt:
-        allow_patterns += ["*.pt"]
-
-    if not is_local and load_format != "tensorizer":
-        # Before we download we look at that is available:
-        fs = HfFileSystem()
-        file_list = fs.ls(model_name_or_path, detail=False, revision=revision)
-
-        # depending on what is available we download different things
-        for pattern in allow_patterns:
-            matching = fnmatch.filter(file_list, pattern)
-            if len(matching) > 0:
-                allow_patterns = [pattern]
-                break
-
-        logger.info(f"Using model weights format {allow_patterns}")
-        # Use file lock to prevent multiple processes from
-        # downloading the same model weights at the same time.
-        with get_lock(model_name_or_path, cache_dir):
-            hf_folder = snapshot_download(model_name_or_path,
-                                          allow_patterns=allow_patterns,
-                                          cache_dir=cache_dir,
-                                          tqdm_class=Disabledtqdm,
-                                          revision=revision)
-    else:
-        hf_folder = model_name_or_path
-    hf_weights_files: List[str] = []
+def download_weights_from_hf(model_name_or_path: str,
+                             cache_dir: Optional[str],
+                             allow_patterns: List[str],
+                             revision: Optional[str] = None) -> str:
+    """Download model weights from Hugging Face Hub.
+    
+    Args:
+        model_name_or_path (str): The model name or path.
+        cache_dir (Optional[str]): The cache directory to store the model
+            weights. If None, will use HF defaults.
+        allow_patterns (List[str]): The allowed patterns for the
+            weight files. Files matched by any of the patterns will be
+            downloaded.
+        revision (Optional[str]): The revision of the model.
+
+    Returns:
+        str: The path to the downloaded model weights.
+    """
+    # Before we download we look at that is available:
+    fs = HfFileSystem()
+    file_list = fs.ls(model_name_or_path, detail=False, revision=revision)
+
+    # depending on what is available we download different things
     for pattern in allow_patterns:
-        hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
-        if len(hf_weights_files) > 0:
-            if pattern == "*.safetensors":
-                use_safetensors = True
+        matching = fnmatch.filter(file_list, pattern)
+        if len(matching) > 0:
+            allow_patterns = [pattern]
             break
-    if not use_safetensors:
-        # Exclude files that are not needed for inference.
-        # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
-        blacklist = [
-            "training_args.bin",
-            "optimizer.bin",
-            "optimizer.pt",
-            "scheduler.pt",
-            "scaler.pt",
-        ]
-        hf_weights_files = [
-            f for f in hf_weights_files
-            if not any(f.endswith(x) for x in blacklist)
-        ]
-
-    if load_format == "tensorizer":
-        return hf_folder, hf_weights_files, use_safetensors
-
-    if len(hf_weights_files) == 0:
-        raise RuntimeError(
-            f"Cannot find any model weights with `{model_name_or_path}`")
-
-    return hf_folder, hf_weights_files, use_safetensors
-
-
-def hf_model_weights_iterator(
-    model_name_or_path: str,
-    cache_dir: Optional[str] = None,
-    load_format: Union[Tuple, str] = "auto",
-    revision: Optional[str] = None,
-    fall_back_to_pt: Optional[bool] = True,
-) -> Iterator[Tuple[str, torch.Tensor]]:
-    hf_folder, hf_weights_files, use_safetensors = prepare_hf_model_weights(
-        model_name_or_path,
-        cache_dir=cache_dir,
-        load_format=load_format,
-        fall_back_to_pt=fall_back_to_pt,
-        revision=revision)
-
-    if load_format == "npcache":
-        # Currently np_cache only support *.bin checkpoints
-        assert use_safetensors is False
-
-        # Convert the model weights from torch tensors to numpy arrays for
-        # faster loading.
-        np_folder = os.path.join(hf_folder, "np")
-        os.makedirs(np_folder, exist_ok=True)
-        weight_names_file = os.path.join(np_folder, "weight_names.json")
-        # Use file lock to prevent multiple processes from
-        # dumping the same model weights to numpy at the same time.
-        with get_lock(model_name_or_path, cache_dir):
-            if not os.path.exists(weight_names_file):
-                weight_names = []
-                for bin_file in hf_weights_files:
-                    state = torch.load(bin_file, map_location="cpu")
-                    for name, param in state.items():
-                        param_path = os.path.join(np_folder, name)
-                        with open(param_path, "wb") as f:
-                            np.save(f, param.cpu().detach().numpy())
-                        weight_names.append(name)
-                with open(weight_names_file, "w") as f:
-                    json.dump(weight_names, f)
-
-        with open(weight_names_file, "r") as f:
-            weight_names = json.load(f)
-
-        for name in weight_names:
-            param_path = os.path.join(np_folder, name)
-            with open(param_path, "rb") as f:
-                param = np.load(f)
-            yield name, torch.from_numpy(param)
-    elif load_format == "tensorizer":
-        from vllm.model_executor.tensorizer_loader import (TensorDeserializer,
-                                                           open_stream,
-                                                           tensorizer_warning)
-        tensorizer_args = load_format.params
-        tensorizer_warning(
-            "Deserializing HuggingFace models is not optimized for "
-            "loading on vLLM, as tensorizer is forced to load to CPU. "
-            "Consider deserializing a vLLM model instead for faster "
-            "load times. See the examples/tensorize_vllm_model.py example "
-            "script for serializing vLLM models.")
-
-        deserializer_args = tensorizer_args.deserializer_params
-        stream_params = tensorizer_args.stream_params
-        stream = open_stream(tensorizer_args.tensorizer_uri, **stream_params)
-        with TensorDeserializer(stream, **deserializer_args,
-                                device="cpu") as state:
-            for name, param in state.items():
+
+    logger.info(f"Using model weights format {allow_patterns}")
+    # Use file lock to prevent multiple processes from
+    # downloading the same model weights at the same time.
+    with get_lock(model_name_or_path, cache_dir):
+        hf_folder = snapshot_download(model_name_or_path,
+                                      allow_patterns=allow_patterns,
+                                      cache_dir=cache_dir,
+                                      tqdm_class=DisabledTqdm,
+                                      revision=revision)
+    return hf_folder
+
+
+def filter_files_not_needed_for_inference(
+        hf_weights_files: List[str]) -> List[str]:
+    """
+    Exclude files that are not needed for inference.
+
+    See https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
+    """
+    blacklist = [
+        "training_args.bin",
+        "optimizer.bin",
+        "optimizer.pt",
+        "scheduler.pt",
+        "scaler.pt",
+    ]
+    hf_weights_files = [
+        f for f in hf_weights_files
+        if not any(f.endswith(x) for x in blacklist)
+    ]
+    return hf_weights_files
+
+
+def np_cache_weights_iterator(
+    model_name_or_path: str, cache_dir: Optional[str], hf_folder: str,
+    hf_weights_files: List[str]
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Iterate over the weights in the model np files.
+
+    Will dump the model weights to numpy files if they are not already dumped.
+    """
+    # Convert the model weights from torch tensors to numpy arrays for
+    # faster loading.
+    np_folder = os.path.join(hf_folder, "np")
+    os.makedirs(np_folder, exist_ok=True)
+    weight_names_file = os.path.join(np_folder, "weight_names.json")
+    # Use file lock to prevent multiple processes from
+    # dumping the same model weights to numpy at the same time.
+    with get_lock(model_name_or_path, cache_dir):
+        if not os.path.exists(weight_names_file):
+            weight_names = []
+            for bin_file in hf_weights_files:
+                state = torch.load(bin_file, map_location="cpu")
+                for name, param in state.items():
+                    param_path = os.path.join(np_folder, name)
+                    with open(param_path, "wb") as f:
+                        np.save(f, param.cpu().detach().numpy())
+                    weight_names.append(name)
+            with open(weight_names_file, "w") as f:
+                json.dump(weight_names, f)
+
+    with open(weight_names_file, "r") as f:
+        weight_names = json.load(f)
+
+    for name in weight_names:
+        param_path = os.path.join(np_folder, name)
+        with open(param_path, "rb") as f:
+            param = np.load(f)
+        yield name, torch.from_numpy(param)
+
+
+def safetensors_weights_iterator(
+    hf_weights_files: List[str]
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Iterate over the weights in the model safetensor files."""
+    for st_file in hf_weights_files:
+        with safe_open(st_file, framework="pt") as f:
+            for name in f.keys():  # noqa: SIM118
+                param = f.get_tensor(name)
                 yield name, param
+
+
+def pt_weights_iterator(
+    hf_weights_files: List[str]
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Iterate over the weights in the model bin/pt files."""
+    for bin_file in hf_weights_files:
+        state = torch.load(bin_file, map_location="cpu")
+        for name, param in state.items():
+            yield name, param
         del state
-    elif use_safetensors:
-        for st_file in hf_weights_files:
-            with safe_open(st_file, framework="pt") as f:
-                for name in f.keys():  # noqa: SIM118
-                    param = f.get_tensor(name)
-                    yield name, param
-    else:
-        for bin_file in hf_weights_files:
-            state = torch.load(bin_file, map_location="cpu")
-            for name, param in state.items():
-                yield name, param
-            del state
-            torch.cuda.empty_cache()
+        torch.cuda.empty_cache()
 
 
 def kv_cache_scales_loader(
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index 30588aecdebe9..69162b0a92d65 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -19,7 +19,7 @@
 # limitations under the License.
 """Inference-only BaiChuan model compatible with HuggingFace weights."""
 import math
-from typing import List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -40,9 +40,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -340,19 +339,14 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
             if name == "lm_head.weight":
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index 40966ab33631a..14f325e624f41 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 """Inference-only BLOOM model compatible with HuggingFace weights."""
 import math
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -35,9 +35,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -298,14 +297,9 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if name == "lm_head.weight":
                 continue
             if not name.startswith("transformer."):
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 7b46ba306619a..3cdb7a7bca1c1 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -2,7 +2,7 @@
 # Adapted from
 # https://github.com/THUDM/ChatGLM2-6B
 """Inference-only ChatGLM model compatible with THUDM weights."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -22,9 +22,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 from vllm.transformers_utils.configs import ChatGLMConfig
 
@@ -370,14 +369,9 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "rotary_pos_emb.inv_freq" in name:
                 continue
             if "word_embeddings" in name:
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index aa9b28b676e0b..d80969773e163 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -20,7 +20,7 @@
 
 # This file is based on the LLama model definition file in transformers
 """PyTorch Cohere model."""
-from typing import List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 import torch.utils.checkpoint
@@ -41,10 +41,9 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -335,13 +334,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(
-        self,
-        model_name_or_path: str,
-        cache_dir: Optional[str] = None,
-        load_format: str = "auto",
-        revision: Optional[str] = None,
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -352,8 +345,7 @@ def load_weights(
         ]
         params_dict = dict(self.named_parameters())
         loaded_params = set()
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             for param_name, shard_name, shard_id in stacked_params_mapping:
                 if shard_name not in name:
                     continue
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index 49eb7f1b2c185..179094b8fd7aa 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 import torch.nn as nn
@@ -18,10 +18,9 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 from vllm.transformers_utils.configs.dbrx import DbrxConfig
 
@@ -391,20 +390,13 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(
-        self,
-        model_name_or_path: str,
-        cache_dir: Optional[str] = None,
-        load_format: str = "auto",
-        revision: Optional[str] = None,
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         expert_params_mapping = [(
             "ws" if weight_name in ["w1", "v1"] else "w2s",
             f"experts.mlp.{weight_name}",
         ) for weight_name in ["w1", "v1", "w2"]]
         params_dict = dict(self.named_parameters(remove_duplicate=False))
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             for param_name, weight_name in expert_params_mapping:
                 if weight_name not in name:
                     continue
diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py
index abf4a462871b0..d476630ee6f11 100644
--- a/vllm/model_executor/models/decilm.py
+++ b/vllm/model_executor/models/decilm.py
@@ -23,16 +23,15 @@
 # limitations under the License.
 """Inference-only DeciLM model compatible with HuggingFace weights."""
 
-from typing import Optional
+from typing import Iterable, Optional, Tuple
 
 import torch
 from transformers import PretrainedConfig
 
 from vllm.config import LoRAConfig
 from vllm.model_executor.layers.linear import LinearMethodBase
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaForCausalLM
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 
 
 class DeciLMForCausalLM(LlamaForCausalLM):
@@ -65,11 +64,7 @@ def __init__(
                          linear_method=linear_method,
                          lora_config=lora_config)
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -79,8 +74,7 @@ def load_weights(self,
             ("gate_up_proj", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
 
diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py
index c7dd11d07e6da..46101a152ec0d 100644
--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@@ -21,7 +21,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Deepseek model."""
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -44,9 +44,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -316,6 +315,8 @@ def forward(
 
 class DeepseekModel(nn.Module):
 
+    fall_back_to_pt_during_load = False
+
     def __init__(
         self,
         config: PretrainedConfig,
@@ -395,11 +396,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -410,12 +407,7 @@ def load_weights(self,
         ]
 
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path,
-                cache_dir,
-                load_format,
-                revision,
-                fall_back_to_pt=False):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 4f1ebcd5fb43c..25ce239d14662 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -19,7 +19,7 @@
 """PyTorch Falcon model."""
 
 import math
-from typing import List, Optional, Union
+from typing import Iterable, List, Optional, Tuple, Union
 
 import torch
 from torch import nn
@@ -40,9 +40,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 from vllm.transformers_utils.configs import RWConfig
 
@@ -399,11 +398,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         total_num_heads = self.config.num_attention_heads
         if self.config.new_decoder_architecture:
             total_num_kv_heads = self.config.num_kv_heads
@@ -413,8 +408,7 @@ def load_weights(self,
             total_num_kv_heads = total_num_heads
         num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads
         params_dict = dict(self.named_parameters(remove_duplicate=False))
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if name == "lm_head.weight":
                 # Falcon uses tied embeddings.
                 continue
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index fc1fc35570368..6d01537c5c344 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 """Inference-only Gemma model compatible with HuggingFace weights."""
 from functools import lru_cache
-from typing import List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -36,9 +36,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 logger = init_logger(__name__)
@@ -346,11 +345,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -361,8 +356,7 @@ def load_weights(self,
         ]
         params_dict = dict(self.named_parameters())
         loaded_params = set()
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             for (param_name, shard_name, shard_id) in stacked_params_mapping:
                 if shard_name not in name:
                     continue
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index 43f0d47fcb122..850050c7232d0 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -17,7 +17,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only GPT-2 model compatible with HuggingFace weights."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -34,9 +34,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -239,14 +238,9 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "lm_head.weight" in name:
                 # GPT-2 ties the weights of the embedding layer and the final
                 # linear layer.
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index cec2d771adfa8..8278ba02514d5 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -18,7 +18,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only GPTBigCode model compatible with HuggingFace weights."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -35,9 +35,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -260,14 +259,9 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "lm_head.weight" in name:
                 continue
             if ".attn.bias" in name:
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 5660097652748..7a830d7f9c965 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -16,7 +16,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only GPT-J model compatible with HuggingFace weights."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -34,9 +34,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -248,11 +247,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -262,8 +257,7 @@ def load_weights(self,
             ("gate_up_proj", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "attn.bias" in name or "attn.masked_bias" in name:
                 continue
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 2f9e2171cf114..b946aed92ed35 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -16,7 +16,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only GPT-NeoX model compatible with HuggingFace weights."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -34,9 +34,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -262,14 +261,9 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if ("attention.bias" in name or "attention.masked_bias" in name
                     or "rotary_emb.inv_freq" in name):
                 continue
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 6e9cbd3f9f43f..db1da8bdc4fb9 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -18,9 +18,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -274,19 +273,14 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("gate_up_proj", "w1", 0),
             ("gate_up_proj", "w3", 1),
         ]
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index a041b0c9a0452..e7ee749e824e4 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -20,7 +20,7 @@
 """Inference-only Jais model compatible with HuggingFace weights."""
 
 import math
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -36,9 +36,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 from vllm.transformers_utils.configs import JAISConfig
 
@@ -303,16 +302,9 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(
-        self,
-        model_name_or_path: str,
-        cache_dir: Optional[str] = None,
-        load_format: str = "auto",
-        revision: Optional[str] = None,
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "lm_head.weight" in name:
                 # GPT-2 ties the weights of the embedding layer and the final
                 # linear layer.
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index c86e292e7df1a..016e3b039d1e8 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -21,7 +21,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only LLaMA model compatible with HuggingFace weights."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -42,10 +42,9 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader, kv_cache_scales_loader)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator,
-                                              kv_cache_scales_loader)
 from vllm.sequence import SamplerOutput
 from vllm.utils import is_hip
 
@@ -376,11 +375,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -390,8 +385,7 @@ def load_weights(self,
             ("gate_up_proj", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
             if ("rotary_emb.cos_cached" in name
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index c2571d0893c8d..314a2792bf167 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -13,10 +13,9 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 _KEYS_TO_MODIFY_MAPPING = {
@@ -198,11 +197,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         # only doing this for language model part for now.
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
@@ -213,8 +208,7 @@ def load_weights(self,
             ("gate_up_proj", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
             for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 49eda9c9a8112..f0d72fafcaf70 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -22,7 +22,7 @@
 # limitations under the License.
 """Inference-only MiniCPM model compatible with HuggingFace weights."""
 import math
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -45,10 +45,9 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -472,11 +471,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -493,8 +488,7 @@ def load_weights(self,
             for weight_name in ["w1", "w2", "w3"]
         ]
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
             if ("rotary_emb.cos_cached" in name
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index ff552a9d86536..4d1755f2bbe63 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -21,7 +21,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Mixtral model."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -43,10 +43,9 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -319,6 +318,8 @@ def forward(
 
 
 class MixtralForCausalLM(nn.Module):
+    fall_back_to_pt_during_load = False
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -393,11 +394,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -414,12 +411,7 @@ def load_weights(self,
         ]
 
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path,
-                cache_dir,
-                load_format,
-                revision,
-                fall_back_to_pt=False):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
 
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py
index 1f0c0e912beea..acd13cc27f159 100644
--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@@ -21,7 +21,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Mixtral model."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
 
 import numpy as np
 import torch
@@ -43,9 +43,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -327,6 +326,7 @@ def forward(
 
 
 class MixtralForCausalLM(nn.Module):
+    fall_back_to_pt_during_load = False
 
     def __init__(
         self,
@@ -366,11 +366,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -379,12 +375,7 @@ def load_weights(self,
         ]
 
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path,
-                cache_dir,
-                load_format,
-                revision,
-                fall_back_to_pt=False):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index af4cdce29d085..340f63286739b 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -1,7 +1,7 @@
 # coding=utf-8
 # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main
 import math
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 import torch.nn as nn
@@ -18,9 +18,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 from vllm.transformers_utils.configs.mpt import MPTConfig
 
@@ -284,14 +283,9 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             # Skip loading extra bias for GPTQ models.
             if name.endswith(".bias") and name not in params_dict:
                 continue
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index 3513c72879102..b92003bc0e067 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -36,7 +36,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 """Inference-only OLMo model compatible with HuggingFace weights."""
-from typing import List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 # this model must need this dependency
@@ -56,9 +56,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -348,16 +347,9 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(
-        self,
-        model_name_or_path: str,
-        cache_dir: Optional[str] = None,
-        load_format: str = "auto",
-        revision: Optional[str] = None,
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             # attention
             if ".att" in name:
                 name = name.replace(".att", ".attn.att")
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 3a640850662c0..89263166bca81 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -17,7 +17,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only OPT model compatible with HuggingFace weights."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -35,9 +35,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -315,11 +314,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -327,8 +322,7 @@ def load_weights(self,
             ("qkv_proj", "v_proj", "v"),
         ]
         params_dict = dict(self.named_parameters(remove_duplicate=False))
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "lm_head.weight" in name:
                 continue
             if name.startswith("decoder."):
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index c606ac027e9d9..bbb9fa5347cc8 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -4,7 +4,7 @@
 # Copyright (c) OrionStar Inc.
 # LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE
 """Inference-only Orion-14B model compatible with HuggingFace weights."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -22,9 +22,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -280,11 +279,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -294,8 +289,7 @@ def load_weights(self,
             ("gate_up_proj", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
             if ("rotary_emb.cos_cached" in name
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index e91624da90955..f974b78a0fbda 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -35,7 +35,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 """Inference-only Phi-1.5 model compatible with HuggingFace weights."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -53,9 +53,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -265,11 +264,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -278,8 +273,7 @@ def load_weights(self,
         ]
         params_dict = dict(self.named_parameters())
 
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
 
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 6213a2ded65ab..a77da7cb15984 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -4,7 +4,7 @@
 # Copyright (c) Alibaba Cloud.
 # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
 """Inference-only QWen model compatible with HuggingFace weights."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -23,9 +23,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -253,19 +252,14 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("gate_up_proj", "w2", 0),
             ("gate_up_proj", "w1", 1),
         ]
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 796e30e633e85..71b906e20ac19 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -22,7 +22,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2 model compatible with HuggingFace weights."""
-from typing import List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -42,9 +42,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -331,11 +330,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -345,8 +340,7 @@ def load_weights(self,
             ("gate_up_proj", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters(remove_duplicate=False))
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
             if self.config.tie_word_embeddings and "lm_head.weight" in name:
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index f920b4f5a40c7..59908bc9ef26a 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -22,7 +22,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2MoE model compatible with HuggingFace weights."""
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import torch
 import torch.nn.functional as F
@@ -46,9 +46,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -366,6 +365,8 @@ def forward(
 
 class Qwen2MoeForCausalLM(nn.Module):
 
+    fall_back_to_pt_during_load = False
+
     def __init__(
         self,
         config: PretrainedConfig,
@@ -404,11 +405,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -419,12 +416,7 @@ def load_weights(self,
         ]
 
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path,
-                cache_dir,
-                load_format,
-                revision,
-                fall_back_to_pt=False):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index 651598b770f13..3e6c2db6f3c65 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -19,7 +19,7 @@
 # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json
 """Inference-only StabeLM (https://github.com/Stability-AI/StableLM)
 model compatible with HuggingFace weights."""
-from typing import List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -37,9 +37,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -262,11 +261,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -276,8 +271,7 @@ def load_weights(self,
             ("gate_up_proj", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
             if ("rotary_emb.cos_cached" in name
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 76e8e48673413..b90f3da141c2e 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -18,7 +18,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ PyTorch Starcoder2 model."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -36,9 +36,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -274,11 +273,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -287,8 +282,7 @@ def load_weights(self,
         ]
 
         params_dict = dict(self.named_parameters(remove_duplicate=False))
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
 
diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py
index d7b9901931b85..adec5b856edd5 100644
--- a/vllm/model_executor/models/xverse.py
+++ b/vllm/model_executor/models/xverse.py
@@ -20,7 +20,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Xverse model compatible with HuggingFace weights."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -40,9 +40,8 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
 
@@ -331,11 +330,7 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             ("qkv_proj", "q_proj", "q"),
             ("qkv_proj", "k_proj", "k"),
@@ -344,8 +339,7 @@ def load_weights(self,
             ("gate_up_proj", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
+        for name, loaded_weight in weights:
             if ("rotary_emb.inv_freq" in name
                     or "rotary_emb.cos_cached" in name
                     or "rotary_emb.sin_cached" in name):
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 5d3d5801c960d..c98a673bfed4b 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -1,8 +1,10 @@
+import os
 from typing import Optional, Union
 
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
                           PreTrainedTokenizerFast)
 
+from vllm.config import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.tokenizers import BaichuanTokenizer
@@ -57,9 +59,26 @@ def get_tokenizer(
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
     tokenizer_revision: Optional[str] = None,
+    download_dir: Optional[str] = None,
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
-    """Gets a tokenizer for the given model name via Huggingface."""
+    """Gets a tokenizer for the given model name via Huggingface/modelscope."""
+    if VLLM_USE_MODELSCOPE:
+        # download model from ModelScope hub,
+        # lazy import so that modelscope is not required for normal use.
+        # pylint: disable=C.
+        from modelscope.hub.snapshot_download import snapshot_download
+
+        # Only set the tokenizer here, model will be downloaded on the workers.
+        if not os.path.exists(tokenizer_name):
+            tokenizer_path = snapshot_download(
+                model_id=tokenizer_name,
+                cache_dir=download_dir,
+                revision=tokenizer_revision,
+                # Ignore weights - we only need the tokenizer.
+                ignore_file_pattern=["*.pt", "*.safetensors", "*.bin"])
+            tokenizer_name = tokenizer_path
+
     if tokenizer_mode == "slow":
         if kwargs.get("use_fast", False):
             raise ValueError(
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 49e1ad5709f5d..d378e3a90e1e7 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -3,8 +3,8 @@
 import torch
 
 from vllm.attention import AttentionMetadata, get_attn_backend
-from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig)
+from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig)
 from vllm.distributed import broadcast_tensor_dict
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
@@ -26,6 +26,7 @@ def __init__(
         parallel_config: ParallelConfig,
         scheduler_config: SchedulerConfig,
         device_config: DeviceConfig,
+        load_config: LoadConfig,
         lora_config: Optional[LoRAConfig],
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
@@ -36,6 +37,7 @@ def __init__(
         self.parallel_config = parallel_config
         self.scheduler_config = scheduler_config
         self.lora_config = lora_config
+        self.load_config = load_config
         self.is_driver_worker = is_driver_worker
 
         # model_config can be None in tests/samplers/test_sampler.py.
@@ -55,8 +57,10 @@ def __init__(
             self.model_config.dtype if model_config is not None else None)
 
     def load_model(self) -> None:
-        self.model = get_model(self.model_config,
-                               self.device_config,
+        self.model = get_model(model_config=self.model_config,
+                               load_config=self.load_config,
+                               device_config=self.device_config,
+                               vision_language_config=None,
                                lora_config=self.lora_config,
                                parallel_config=self.parallel_config,
                                scheduler_config=self.scheduler_config)
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 41341b063bed7..6610b9c4be876 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -5,8 +5,8 @@
 import torch.distributed
 
 from vllm.attention import get_attn_backend
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                         ModelConfig, ParallelConfig, SchedulerConfig)
 from vllm.distributed import (broadcast_tensor_dict,
                               ensure_model_parallel_initialized,
                               init_distributed_environment)
@@ -117,6 +117,7 @@ def __init__(
         scheduler_config: SchedulerConfig,
         device_config: DeviceConfig,
         cache_config: CacheConfig,
+        load_config: LoadConfig,
         local_rank: int,
         rank: int,
         distributed_init_method: str,
@@ -129,6 +130,7 @@ def __init__(
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.cache_config = cache_config
+        self.load_config = load_config
         self.local_rank = local_rank
         self.rank = rank
         self.distributed_init_method = distributed_init_method
@@ -141,6 +143,7 @@ def __init__(
                                            parallel_config,
                                            scheduler_config,
                                            device_config,
+                                           load_config=self.load_config,
                                            lora_config=self.lora_config,
                                            kv_cache_dtype=kv_cache_dtype,
                                            is_driver_worker=is_driver_worker)
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 12ef80ab60f20..1b816afe7c0e7 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -11,9 +11,8 @@
 
 from vllm.attention import (AttentionMetadata, AttentionMetadataPerStage,
                             get_attn_backend)
-from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, TensorizerConfig,
-                         VisionLanguageConfig)
+from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig, VisionLanguageConfig)
 from vllm.distributed import broadcast_tensor_dict, with_pynccl_for_all_reduce
 from vllm.distributed.device_communicators import (custom_all_reduce,
                                                    pynccl_utils)
@@ -110,17 +109,17 @@ def __init__(
         parallel_config: ParallelConfig,
         scheduler_config: SchedulerConfig,
         device_config: DeviceConfig,
+        load_config: LoadConfig,
         lora_config: Optional[LoRAConfig],
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
         vision_language_config: Optional[VisionLanguageConfig] = None,
-        tensorizer_config: Optional[TensorizerConfig] = None,
     ):
         self.model_config = model_config
         self.parallel_config = parallel_config
         self.scheduler_config = scheduler_config
         self.lora_config = lora_config
-        self.tensorizer_config = tensorizer_config
+        self.load_config = load_config
         self.is_driver_worker = is_driver_worker
 
         # model_config can be None in tests/samplers/test_sampler.py.
@@ -158,13 +157,13 @@ def __init__(
     def load_model(self) -> None:
         with CudaMemoryProfiler() as m:
             self.model = get_model(
-                self.model_config,
-                self.device_config,
+                model_config=self.model_config,
+                device_config=self.device_config,
+                load_config=self.load_config,
                 lora_config=self.lora_config,
                 vision_language_config=self.vision_language_config,
                 parallel_config=self.parallel_config,
                 scheduler_config=self.scheduler_config,
-                tensorizer_config=self.tensorizer_config,
             )
 
         self.model_memory_usage = m.consumed_memory
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index fff721a80c204..f70a7193effeb 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -6,7 +6,7 @@
                          SchedulerConfig)
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
-from vllm.model_executor.neuron_model_loader import get_neuron_model
+from vllm.model_executor.model_loader.neuron import get_neuron_model
 from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
 from vllm.utils import (async_tensor_h2d, is_pin_memory_available,
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 82491c6df6616..6a79285f60579 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -6,8 +6,8 @@
 import torch
 import torch.distributed
 
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, TensorizerConfig,
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                         ModelConfig, ParallelConfig, SchedulerConfig,
                          VisionLanguageConfig)
 from vllm.distributed import (broadcast_tensor_dict,
                               ensure_model_parallel_initialized,
@@ -38,12 +38,12 @@ def __init__(
         scheduler_config: SchedulerConfig,
         device_config: DeviceConfig,
         cache_config: CacheConfig,
+        load_config: LoadConfig,
         local_rank: int,
         rank: int,
         distributed_init_method: str,
         lora_config: Optional[LoRAConfig] = None,
         vision_language_config: Optional[VisionLanguageConfig] = None,
-        tensorizer_config: Optional[TensorizerConfig] = None,
         is_driver_worker: bool = False,
     ) -> None:
         self.model_config = model_config
@@ -55,7 +55,7 @@ def __init__(
         self.rank = rank
         self.distributed_init_method = distributed_init_method
         self.lora_config = lora_config
-        self.tensorizer_config = tensorizer_config
+        self.load_config = load_config
         self.is_driver_worker = is_driver_worker
         if self.is_driver_worker:
             assert self.rank == 0, "The driver worker must have rank 0."
@@ -70,11 +70,11 @@ def __init__(
             parallel_config,
             scheduler_config,
             device_config,
+            load_config=load_config,
             lora_config=self.lora_config,
             kv_cache_dtype=self.cache_config.cache_dtype,
             is_driver_worker=is_driver_worker,
             vision_language_config=vision_language_config,
-            tensorizer_config=tensorizer_config,
         )
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.

From 945a6b70bcf1fdb64c56e0fae92b8bf50d845eb9 Mon Sep 17 00:00:00 2001
From: Cade Daniel <edacih@gmail.com>
Date: Tue, 16 Apr 2024 13:09:21 -0700
Subject: [PATCH 028/106] [Speculative decoding 6/9] Integrate speculative
 decoding with LLMEngine  (#3894)

---
 tests/core/block/e2e/test_correctness.py      |  70 ++++
 tests/core/utils.py                           |  17 +-
 .../output_processor/test_multi_step.py       | 270 +++++++++++++
 tests/spec_decode/e2e/test_correctness.py     | 127 +++++-
 tests/spec_decode/test_multi_step_worker.py   |   4 +-
 tests/spec_decode/test_spec_decode_worker.py  |  32 +-
 tests/spec_decode/utils.py                    |   2 +-
 vllm/core/block/block_table.py                |   1 -
 vllm/core/scheduler.py                        |   8 +-
 vllm/engine/async_llm_engine.py               |   4 +-
 vllm/engine/llm_engine.py                     | 371 +++---------------
 vllm/engine/output_processor/__init__.py      |   0
 vllm/engine/output_processor/interfaces.py    |  69 ++++
 vllm/engine/output_processor/multi_step.py    | 126 ++++++
 vllm/engine/output_processor/single_step.py   | 276 +++++++++++++
 vllm/engine/output_processor/stop_checker.py  | 101 +++++
 vllm/engine/output_processor/util.py          |  16 +
 vllm/executor/cpu_executor.py                 |   3 +-
 vllm/executor/executor_base.py                |   5 +-
 vllm/executor/gpu_executor.py                 |  79 +++-
 vllm/executor/neuron_executor.py              |   5 +-
 vllm/executor/ray_gpu_executor.py             |   3 +-
 vllm/sequence.py                              |  13 +
 vllm/spec_decode/batch_expansion.py           |  23 +-
 vllm/spec_decode/multi_step_worker.py         |  16 +-
 vllm/spec_decode/spec_decode_worker.py        |  44 ++-
 vllm/spec_decode/util.py                      |  26 ++
 vllm/worker/cpu_worker.py                     |   8 +-
 vllm/worker/neuron_worker.py                  |  11 +-
 vllm/worker/worker.py                         |  11 +-
 vllm/worker/worker_base.py                    |  13 +-
 31 files changed, 1347 insertions(+), 407 deletions(-)
 create mode 100644 tests/engine/output_processor/test_multi_step.py
 create mode 100644 vllm/engine/output_processor/__init__.py
 create mode 100644 vllm/engine/output_processor/interfaces.py
 create mode 100644 vllm/engine/output_processor/multi_step.py
 create mode 100644 vllm/engine/output_processor/single_step.py
 create mode 100644 vllm/engine/output_processor/stop_checker.py
 create mode 100644 vllm/engine/output_processor/util.py

diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py
index 94b65401e1dd4..0ee78a9b0a8ea 100644
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -230,6 +230,76 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
     assert baseline_token_ids == test_token_ids
 
 
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [
+        {
+            # Use a small model for a fast test.
+            "model": "facebook/opt-125m",
+
+            # skip cuda graph creation for fast test.
+            "enforce_eager": True,
+            "enable_chunked_prefill": True,
+            "max_num_batched_tokens": 2,
+            "max_num_seqs": 2,
+        },
+    ])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [
+    {
+        "use_v2_block_manager": False,
+    },
+])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "use_v2_block_manager": True,
+        "num_lookahead_slots": 0,
+    },
+    {
+        "use_v2_block_manager": True,
+        "num_lookahead_slots": 5,
+    },
+])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
+                                          test_llm_generator, batch_size):
+    """Verify that chunked prefill works with BlockManagerV2, with and without
+    lookahead scheduling.
+    """
+    output_len = 32
+    temperature = 0.0
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    print('Getting token ids with BlockManagerV1')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    print('Getting token ids with BlockManagerV2')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+
+    assert baseline_token_ids == test_token_ids
+
+
 def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
     for llm in llm_generator:
         outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
diff --git a/tests/core/utils.py b/tests/core/utils.py
index fbbdb07cb8e6e..22c1d3826dff4 100644
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -1,5 +1,5 @@
 import time
-from typing import Optional, Tuple
+from typing import Iterable, Optional, Tuple
 
 from vllm import SamplingParams
 from vllm.lora.request import LoRARequest
@@ -31,14 +31,17 @@ def create_dummy_prompt(
 
 
 def create_seq_group(
-    seq_prompt_len=1024,
-    seq_output_lens=(128, ),
-    request_id='0',
-    seq_id_start=0,
-) -> SequenceGroup:
+        seq_prompt_len: int = 1024,
+        seq_output_lens: Iterable[int] = (128, ),
+        request_id: str = '0',
+        seq_id_start: int = 0,
+        sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:
 
     assert len(seq_output_lens) > 0
 
+    if sampling_params is None:
+        sampling_params = SamplingParams()
+
     prompt_token_ids = [0] * seq_prompt_len
 
     seqs = []
@@ -60,7 +63,7 @@ def create_seq_group(
     seq_group = SequenceGroup(
         request_id=request_id,
         seqs=seqs,
-        sampling_params=SamplingParams(),
+        sampling_params=sampling_params,
         arrival_time=time.time(),
     )
 
diff --git a/tests/engine/output_processor/test_multi_step.py b/tests/engine/output_processor/test_multi_step.py
new file mode 100644
index 0000000000000..6da3da091db78
--- /dev/null
+++ b/tests/engine/output_processor/test_multi_step.py
@@ -0,0 +1,270 @@
+import random
+from unittest.mock import MagicMock
+
+import pytest
+from transformers import PreTrainedTokenizer
+
+from tests.core.utils import create_seq_group
+from vllm.core.scheduler import Scheduler
+from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import (Logprob, SequenceGroupOutput, SequenceOutput,
+                           SequenceStatus)
+from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
+
+
+@pytest.mark.parametrize("seq_output_len", [128])
+@pytest.mark.parametrize("num_new_tokens", [1, 12])
+@pytest.mark.skip_global_cleanup
+def test_appends_token_ids(num_new_tokens: int, seq_output_len: int):
+    """Verify multi-step decoding appends token ids correctly.
+
+    We append token ids and verify all the token ids were appended correctly.
+    Note that ignore_eos=True.
+    """
+    detokenizer = MagicMock(spec=Detokenizer)
+    scheduler = MagicMock(spec=Scheduler)
+    stop_checker = MagicMock(spec=StopChecker)
+    seq_counter = Counter()
+
+    output_processor = MultiStepOutputProcessor(
+        detokenizer=detokenizer,
+        scheduler=scheduler,
+        seq_counter=seq_counter,
+        get_tokenizer_for_seq=lambda _: mock_tokenizer(),
+        stop_checker=stop_checker,
+    )
+
+    seq_group = create_seq_group(
+        seq_prompt_len=1024,
+        seq_output_lens=[seq_output_len],
+        sampling_params=SamplingParams(max_tokens=seq_output_len +
+                                       num_new_tokens,
+                                       ignore_eos=True),
+    )
+
+    seq = seq_group.get_seqs()[0]
+    seq.status = SequenceStatus.RUNNING
+
+    new_token_ids = list(range(num_new_tokens))
+
+    outputs = [
+        SequenceGroupOutput(
+            samples=[
+                SequenceOutput(
+                    parent_seq_id=seq.seq_id,
+                    output_token=output_token,
+                    logprobs={output_token: Logprob(0.0)},
+                )
+            ],
+            prompt_logprobs=None,
+        ) for output_token in new_token_ids
+    ]
+
+    assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids
+    output_processor.process_outputs(seq_group, outputs)
+    assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids
+
+
+@pytest.mark.parametrize("seq_prompt_len", [1024])
+@pytest.mark.parametrize("seq_output_len", [128])
+@pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8])
+@pytest.mark.parametrize("max_tokens", [128 + 3])
+@pytest.mark.skip_global_cleanup
+def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int,
+                             seq_output_len: int, max_tokens: int):
+    """Verify tokens after max_tokens are dropped and not appended to the
+    sequence.
+    """
+    detokenizer = MagicMock(spec=Detokenizer)
+    scheduler = MagicMock(spec=Scheduler)
+    stop_checker = MagicMock(spec=StopChecker)
+    seq_counter = Counter()
+
+    output_processor = MultiStepOutputProcessor(
+        detokenizer=detokenizer,
+        scheduler=scheduler,
+        seq_counter=seq_counter,
+        get_tokenizer_for_seq=lambda _: mock_tokenizer(),
+        stop_checker=stop_checker,
+    )
+
+    seq_group = create_seq_group(
+        seq_prompt_len=seq_prompt_len,
+        seq_output_lens=[seq_output_len],
+        sampling_params=SamplingParams(max_tokens=max_tokens, ),
+    )
+
+    seq = seq_group.get_seqs()[0]
+    seq.status = SequenceStatus.RUNNING
+
+    new_token_ids = list(range(num_new_tokens))
+
+    outputs = [
+        SequenceGroupOutput(
+            samples=[
+                SequenceOutput(
+                    parent_seq_id=seq.seq_id,
+                    output_token=output_token,
+                    logprobs={output_token: Logprob(0.0)},
+                )
+            ],
+            prompt_logprobs=None,
+        ) for output_token in new_token_ids
+    ]
+
+    assert seq.get_len() == seq_prompt_len + seq_output_len
+    output_processor.process_outputs(seq_group, outputs)
+
+    # Expect the processed sequence to not go over max tokens in len.
+    assert seq.get_len() == seq_prompt_len + max_tokens
+
+    # Expect the correct tokens were appended.
+    expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len]
+    assert seq.get_token_ids(
+    )[-len(expected_appended_tokens):] == expected_appended_tokens
+
+
+@pytest.mark.parametrize("seq_prompt_len", [1024])
+@pytest.mark.parametrize("seq_output_len", [128])
+@pytest.mark.parametrize("num_new_tokens", [12])
+@pytest.mark.parametrize("seed", list(range(6)))
+@pytest.mark.skip_global_cleanup
+def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
+                               seq_output_len: int, seed: int):
+    """Verify the eos token id is included in the sequence, but subsequent
+    tokens are dropped (not appended to sequence).
+    """
+    random.seed(seed)
+    detokenizer = MagicMock(spec=Detokenizer)
+    scheduler = MagicMock(spec=Scheduler)
+    stop_checker = MagicMock(spec=StopChecker)
+    seq_counter = Counter()
+
+    eos_token_id = 100
+
+    output_processor = MultiStepOutputProcessor(
+        detokenizer=detokenizer,
+        scheduler=scheduler,
+        seq_counter=seq_counter,
+        get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
+        stop_checker=stop_checker,
+    )
+
+    seq_group = create_seq_group(
+        seq_prompt_len=seq_prompt_len,
+        seq_output_lens=[seq_output_len],
+        sampling_params=SamplingParams(
+            # Ensure enough space.
+            max_tokens=seq_output_len + num_new_tokens, ),
+    )
+
+    seq = seq_group.get_seqs()[0]
+    seq.status = SequenceStatus.RUNNING
+
+    new_token_ids = list(range(num_new_tokens))
+    assert eos_token_id not in new_token_ids
+    eos_index = random.randint(0, len(new_token_ids) - 1)
+    new_token_ids[eos_index] = eos_token_id
+
+    outputs = [
+        SequenceGroupOutput(
+            samples=[
+                SequenceOutput(
+                    parent_seq_id=seq.seq_id,
+                    output_token=output_token,
+                    logprobs={output_token: Logprob(0.0)},
+                )
+            ],
+            prompt_logprobs=None,
+        ) for output_token in new_token_ids
+    ]
+
+    assert seq.get_len() == seq_prompt_len + seq_output_len
+    output_processor.process_outputs(seq_group, outputs)
+
+    # Expect the processed sequence to not go beyond provided eos.
+    assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1)
+
+    # Expect the correct tokens were appended.
+    expected_appended_tokens = new_token_ids[:eos_index + 1]
+    assert seq.get_token_ids(
+    )[-len(expected_appended_tokens):] == expected_appended_tokens
+
+
+@pytest.mark.parametrize("seq_prompt_len", [1024])
+@pytest.mark.parametrize("seq_output_len", [128])
+@pytest.mark.parametrize("num_new_tokens", [12])
+@pytest.mark.parametrize("seed", list(range(6)))
+@pytest.mark.skip_global_cleanup
+def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
+                              seq_output_len: int, seed: int):
+    """When sampling parameters dictate that we should ignore the eos token id,
+    ensure all token ids are appended even if the eos token id is emitted.
+    """
+    random.seed(seed)
+    detokenizer = MagicMock(spec=Detokenizer)
+    scheduler = MagicMock(spec=Scheduler)
+    stop_checker = MagicMock(spec=StopChecker)
+    seq_counter = Counter()
+
+    eos_token_id = 100
+
+    output_processor = MultiStepOutputProcessor(
+        detokenizer=detokenizer,
+        scheduler=scheduler,
+        seq_counter=seq_counter,
+        get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
+        stop_checker=stop_checker,
+    )
+
+    seq_group = create_seq_group(
+        seq_prompt_len=seq_prompt_len,
+        seq_output_lens=[seq_output_len],
+        sampling_params=SamplingParams(
+            # Ensure enough space.
+            max_tokens=seq_output_len + num_new_tokens,
+            ignore_eos=True,
+        ),
+    )
+
+    seq = seq_group.get_seqs()[0]
+    seq.status = SequenceStatus.RUNNING
+
+    new_token_ids = list(range(num_new_tokens))
+    assert eos_token_id not in new_token_ids
+    eos_index = random.randint(0, len(new_token_ids) - 1)
+    new_token_ids[eos_index] = eos_token_id
+
+    outputs = [
+        SequenceGroupOutput(
+            samples=[
+                SequenceOutput(
+                    parent_seq_id=seq.seq_id,
+                    output_token=output_token,
+                    logprobs={output_token: Logprob(0.0)},
+                )
+            ],
+            prompt_logprobs=None,
+        ) for output_token in new_token_ids
+    ]
+
+    assert seq.get_len() == seq_prompt_len + seq_output_len
+    output_processor.process_outputs(seq_group, outputs)
+
+    # Expect the processed sequence to go beyond eos.
+    assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens
+
+    # Expect the correct tokens were appended.
+    expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens -
+                                             seq_output_len]
+    assert seq.get_token_ids(
+    )[-len(expected_appended_tokens):] == expected_appended_tokens
+
+
+def mock_tokenizer(eos_token_id=1000):
+    tokenizer = MagicMock(spec=PreTrainedTokenizer)
+    tokenizer.eos_token_id = eos_token_id
+    return tokenizer
diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py
index b5a6fcb7900a3..a8ebd66841eb2 100644
--- a/tests/spec_decode/e2e/test_correctness.py
+++ b/tests/spec_decode/e2e/test_correctness.py
@@ -1,4 +1,8 @@
+from itertools import cycle
+from typing import List, Tuple
+
 import pytest
+from transformers import AutoTokenizer
 
 from vllm import SamplingParams
 
@@ -7,18 +11,47 @@
     "common_llm_kwargs",
     [{
         # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-        "speculative_model": "facebook/opt-125m",
-        "num_speculative_tokens": 5,
+        # Note this is repeated in the test body; to initialize a tokenizer.
+        "model": "JackFram/llama-68m",
+
+        # Skip real loading for fast test.
+        "load_format": "dummy",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
 
         # Required for spec decode.
         "use_v2_block_manager": True
     }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [
+        {
+            "speculative_model": "JackFram/llama-68m",
+            "num_speculative_tokens": 5,
+        },
+        {
+            "speculative_model": "JackFram/llama-68m",
+            "num_speculative_tokens": 1,
+        },
+        {
+            # No spec decode.
+        },
+    ])
 @pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("batch_size", [1])
+# NOTE: We should run more permutations of this test (more BS, more seeds). But
+# because our spec decode generates gibberish token ids, the likelihood of
+# emitting an invalid token combination is nontrivial. This causes divergence in
+# behavior of vLLM detokenization vs. hf tokenizer, for example when two "utf-
+# start" bytes are emitted.
 @pytest.mark.parametrize("seed", [1])
-def test_spec_decode_config(test_llm_generator):
-    output_len = 1024
+def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int):
+    """Run generation with speculative decoding on a batch. Verify the engine
+    generates the correct number of tokens (via ignore_eos=True), and that the
+    detokenization matches HF transformers.
+    """
+    output_len = 32
     temperature = 0.0
 
     prompts = [
@@ -28,23 +61,91 @@ def test_spec_decode_config(test_llm_generator):
         "The future of AI is",
     ]
 
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+        skip_special_tokens=True,
+        spaces_between_special_tokens=False,
+    )
+
+    batch_tokens, batch_token_ids = get_output_from_llm_generator(
+        test_llm_generator, prompts, sampling_params)
+
+    # Expect a generation for each prompt in the batch.
+    assert len(batch_token_ids) == len(prompts)
+
+    # Expect each generation to have expected number of tokens (note
+    # ignore_eos=True).
+    assert all(len(token_ids) == output_len for token_ids in batch_token_ids)
+
+    # Expect detokenized string to match.
+    tok = AutoTokenizer.from_pretrained("JackFram/llama-68m")
+    for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids):
+        expected_tokens = tok.decode(actual_token_ids)
+        print(f"{actual_token_ids=}")
+        assert actual_tokens.strip() == expected_tokens.strip()
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "JackFram/llama-68m",
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+
+        # Skip real loading for fast test.
+        "load_format": "dummy",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [
+        {
+            # Expect failure as spec decode not supported by
+            # Ray backend.
+            "worker_use_ray": True,
+        },
+    ])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_xfail(test_llm_generator):
+    """Verify that speculative decoding with Ray fails.
+    """
+    output_len = 128
+    temperature = 0.0
+
+    prompts = [
+        "Hello, my name is",
+    ]
+
     sampling_params = SamplingParams(
         max_tokens=output_len,
         ignore_eos=True,
         temperature=temperature,
     )
 
-    with pytest.raises(
-            AssertionError,
-            match="Speculative decoding not yet supported for GPU backend"):
-        get_token_ids_from_llm_generator(test_llm_generator, prompts,
-                                         sampling_params)
+    with pytest.raises(AssertionError,
+                       match="Speculative decoding not yet supported for "):
+        get_output_from_llm_generator(test_llm_generator, prompts,
+                                      sampling_params)
 
 
-def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
+def get_output_from_llm_generator(
+        llm_generator, prompts,
+        sampling_params) -> Tuple[List[str], List[List[int]]]:
     for llm in llm_generator:
         outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
         token_ids = [output.outputs[0].token_ids for output in outputs]
+        tokens = [output.outputs[0].text for output in outputs]
         del llm
 
-    return token_ids
+    return tokens, token_ids
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index f4d44108b47c2..d6edbab579afd 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -125,7 +125,7 @@ def test_same_output_for_single_step():
     zero_kv_cache(worker.cache_engine)
     set_random_seed(seed)
     expected_output = worker.execute_model(
-        **single_step_execute_model_data.to_dict(), )
+        **single_step_execute_model_data.to_dict(), )[0]
 
     actual_token_ids = [
         output.samples[0].output_token for output in actual_output
@@ -219,7 +219,7 @@ def test_same_output_for_multi_step():
                 continuations=continuations,
                 final_seq_lens=final_seq_lens))
 
-        single_step_output.append(
+        single_step_output.extend(
             worker.execute_model(**execute_model_data.to_dict(), ))
 
         # Append output tokens to new sequence data.
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index 47aff8f575413..0a3110775e2d6 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -6,6 +6,7 @@
 
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.model_executor.utils import set_random_seed
+from vllm.sequence import SamplerOutput
 from vllm.spec_decode.interfaces import SpeculativeProposals
 from vllm.spec_decode.metrics import (AsyncMetricsCollector,
                                       SpecDecodeWorkerMetrics)
@@ -37,7 +38,8 @@ def test_correctly_calls_draft_model(k: int, batch_size: int):
     execute_model_data, _, _ = create_batch(batch_size, k)
 
     with pytest.raises(ValueError, match=exception_secret):
-        worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k)
+        worker.execute_model(**execute_model_data.to_dict(),
+                             num_lookahead_slots=k)
 
     call_args_list = draft_worker.get_spec_proposals.call_args_list
     assert len(call_args_list) == 1
@@ -102,7 +104,8 @@ def test_correctly_calls_target_model(k: int, batch_size: int):
     target_worker.execute_model.side_effect = ValueError(exception_secret)
 
     with pytest.raises(ValueError, match=exception_secret):
-        worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k)
+        worker.execute_model(**execute_model_data.to_dict(),
+                             num_lookahead_slots=k)
 
     seen_contexts = []
 
@@ -189,13 +192,14 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int):
     target_output = create_sampler_output_list(target_token_ids,
                                                target_token_probs)
 
-    target_worker.execute_model.return_value = target_output[0]
+    target_worker.execute_model.return_value = [target_output[0]]
 
     exception_secret = 'artifical stop'
     rejection_sampler.side_effect = ValueError(exception_secret)
 
     with pytest.raises(ValueError, match=exception_secret):
-        worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k)
+        worker.execute_model(**execute_model_data.to_dict(),
+                             num_lookahead_slots=k)
 
     assert len(rejection_sampler.call_args_list) == 1
     args, _ = rejection_sampler.call_args_list[0]
@@ -268,7 +272,7 @@ def test_correctly_formats_output(k: int, batch_size: int):
     target_output = create_sampler_output_list(target_token_ids,
                                                target_token_probs)
 
-    target_worker.execute_model.return_value = target_output[0]
+    target_worker.execute_model.return_value = [target_output[0]]
 
     rejection_sampler_output = torch.randint(low=0,
                                              high=vocab_size,
@@ -283,7 +287,7 @@ def test_correctly_formats_output(k: int, batch_size: int):
     rejection_sampler.return_value = rejection_sampler_output
 
     output = worker.execute_model(**execute_model_data.to_dict(),
-                                  num_spec_tokens=k)
+                                  num_lookahead_slots=k)
 
     expected_output = create_sampler_output_list(
         rejection_sampler_output.transpose(0, 1), [None for _ in range(k + 1)])
@@ -380,7 +384,7 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool):
     target_output = create_sampler_output_list(target_token_ids,
                                                target_token_probs)
 
-    target_worker.execute_model.return_value = target_output[0]
+    target_worker.execute_model.return_value = [target_output[0]]
 
     rejection_sampler_output = torch.randint(low=0,
                                              high=vocab_size,
@@ -400,7 +404,7 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool):
         mock_rejsample_metrics)
 
     output = worker.execute_model(**execute_model_data.to_dict(),
-                                  num_spec_tokens=k)
+                                  num_lookahead_slots=k)
     assert output[0].spec_decode_worker_metrics == mock_rejsample_metrics
 
     call_args_list = (
@@ -423,6 +427,8 @@ def test_k_equals_zero(k: int, batch_size: int):
     rejection_sampler.token_id_dtype = torch.int64
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
 
+    target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)]
+
     draft_worker.device = 'cuda'
     target_worker.device = 'cuda'
 
@@ -435,7 +441,7 @@ def test_k_equals_zero(k: int, batch_size: int):
         batch_size, k, prev_output_token_len=0)
 
     out = worker.execute_model(**execute_model_data.to_dict(),
-                               num_spec_tokens=k)
+                               num_lookahead_slots=k)
 
     assert len(out) == 1, f"expected only one token output when {k=}"
     assert out[0].probs is None, "expect gpu tensor references to be None"
@@ -443,7 +449,7 @@ def test_k_equals_zero(k: int, batch_size: int):
         0].sampled_tokens is None, "expect gpu tensor references to be None"
 
     draft_worker.execute_model.assert_called_once_with(
-        **execute_model_data.to_dict(), return_python_output=False)
+        **execute_model_data.to_dict())
     target_worker.execute_model.assert_called_once_with(
         **execute_model_data.to_dict())
 
@@ -462,6 +468,8 @@ def test_empty_input_batch(k: int, batch_size: int):
     rejection_sampler.token_id_dtype = torch.int64
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
 
+    target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)]
+
     draft_worker.device = 'cuda'
     target_worker.device = 'cuda'
 
@@ -474,7 +482,7 @@ def test_empty_input_batch(k: int, batch_size: int):
         batch_size, k, prev_output_token_len=0)
 
     out = worker.execute_model(**execute_model_data.to_dict(),
-                               num_spec_tokens=k)
+                               num_lookahead_slots=k)
 
     assert len(out) == 1, f"expected only one token output when {k=}"
     assert out[0].probs is None, "expect gpu tensor references to be None"
@@ -482,7 +490,7 @@ def test_empty_input_batch(k: int, batch_size: int):
         0].sampled_tokens is None, "expect gpu tensor references to be None"
 
     draft_worker.execute_model.assert_called_once_with(
-        **execute_model_data.to_dict(), return_python_output=False)
+        **execute_model_data.to_dict())
     target_worker.execute_model.assert_called_once_with(
         **execute_model_data.to_dict())
 
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index edba4c226b289..d04b6029493f4 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -212,7 +212,7 @@ def create_sampler_output_list(
                     SequenceOutput(
                         output_token=token_id,
                         parent_seq_id=seq_ids[seq_index],
-                        logprobs={token_id: 0},
+                        logprobs={token_id: Logprob(0)},
                     )
                 ],
                 prompt_logprobs=None,
diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index ba061bbc4fbcb..560267e55ea3a 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -104,7 +104,6 @@ def append_token_ids(self,
             token_ids (List[int]): The sequence of token IDs to be appended.
         """
         assert self._is_allocated
-        assert token_ids, "can't append empty token ids"
 
         self.ensure_num_empty_slots(num_empty_slots=len(token_ids) +
                                     num_lookahead_slots)
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 4da7600387f05..76379cf82bd1e 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -762,9 +762,7 @@ def _schedule_default(self) -> SchedulerOutputs:
             blocks_to_copy=merge_dicts(running_scheduled.blocks_to_copy,
                                        swapped_in.blocks_to_copy),
             ignored_seq_groups=prefills.ignored_seq_groups,
-            num_lookahead_slots=(prefills.num_lookahead_slots +
-                                 running_scheduled.num_lookahead_slots +
-                                 swapped_in.num_lookahead_slots),
+            num_lookahead_slots=running_scheduled.num_lookahead_slots,
         )
 
     def _schedule_chunked_prefill(self):
@@ -850,9 +848,7 @@ def _schedule_chunked_prefill(self):
             blocks_to_copy=merge_dicts(running_scheduled.blocks_to_copy,
                                        swapped_in.blocks_to_copy),
             ignored_seq_groups=prefills.ignored_seq_groups,
-            num_lookahead_slots=(prefills.num_lookahead_slots +
-                                 running_scheduled.num_lookahead_slots +
-                                 swapped_in.num_lookahead_slots),
+            num_lookahead_slots=running_scheduled.num_lookahead_slots,
         )
 
     def _schedule(self) -> SchedulerOutputs:
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 1dbf58904541c..27192449bf15a 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -217,7 +217,9 @@ async def step_async(self) -> List[RequestOutput]:
         else:
             output = []
 
-        return self._process_model_outputs(output, scheduler_outputs)
+        return self._process_model_outputs(
+            output, scheduler_outputs.scheduled_seq_groups,
+            scheduler_outputs.ignored_seq_groups)
 
     async def encode_request_async(
         self,
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 7fb8c55994e34..ca788d37c8f40 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1,5 +1,5 @@
 import time
-from typing import Iterable, List, Optional, Tuple, Type, Union
+from typing import Iterable, List, Optional, Type, Union
 
 from transformers import PreTrainedTokenizer
 
@@ -11,6 +11,10 @@
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.metrics import StatLogger, Stats
+from vllm.engine.output_processor.interfaces import (
+    SequenceGroupOutputProcessor)
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.engine.output_processor.util import create_output_by_sequence_group
 from vllm.engine.ray_utils import initialize_ray_cluster
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
@@ -18,8 +22,7 @@
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (MultiModalData, SamplerOutput, Sequence,
-                           SequenceGroup, SequenceGroupOutput, SequenceOutput,
-                           SequenceStatus)
+                           SequenceGroup)
 from vllm.transformers_utils.detokenizer import Detokenizer
 from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
                                                      get_tokenizer_group)
@@ -189,6 +192,21 @@ def __init__(
                 labels=dict(model_name=model_config.model))
             self.stat_logger.info("cache_config", self.cache_config)
 
+        # Create sequence output processor, e.g. for beam search or
+        # speculative decoding.
+        self.output_processor = (
+            SequenceGroupOutputProcessor.create_output_processor(
+                self.scheduler_config,
+                self.detokenizer,
+                self.scheduler,
+                self.seq_counter,
+                self.get_tokenizer_for_seq,
+                stop_checker=StopChecker(
+                    self.scheduler_config.max_model_len,
+                    self.get_tokenizer_for_seq,
+                ),
+            ))
+
     def _initialize_kv_caches(self) -> None:
         """Initialize the KV cache in the worker(s).
 
@@ -414,240 +432,32 @@ def has_unfinished_requests(self) -> bool:
         """Returns True if there are unfinished requests."""
         return self.scheduler.has_unfinished_seqs()
 
-    def _check_beam_search_early_stopping(
-        self,
-        early_stopping: Union[bool, str],
-        sampling_params: SamplingParams,
-        best_running_seq: Sequence,
-        current_worst_seq: Sequence,
-    ) -> bool:
-        assert sampling_params.use_beam_search
-        length_penalty = sampling_params.length_penalty
-        if early_stopping is True:
-            return True
-
-        current_worst_score = current_worst_seq.get_beam_search_score(
-            length_penalty=length_penalty,
-            eos_token_id=current_worst_seq.eos_token_id)
-        if early_stopping is False:
-            highest_attainable_score = best_running_seq.get_beam_search_score(
-                length_penalty=length_penalty,
-                eos_token_id=best_running_seq.eos_token_id)
-        else:
-            assert early_stopping == "never"
-            if length_penalty > 0.0:
-                # If length_penalty > 0.0, beam search will prefer longer
-                # sequences. The highest attainable score calculation is
-                # based on the longest possible sequence length in this case.
-                max_possible_length = max(
-                    best_running_seq.get_prompt_len() +
-                    sampling_params.max_tokens,
-                    self.scheduler_config.max_model_len)
-                highest_attainable_score = (
-                    best_running_seq.get_beam_search_score(
-                        length_penalty=length_penalty,
-                        eos_token_id=best_running_seq.eos_token_id,
-                        seq_len=max_possible_length))
-            else:
-                # Otherwise, beam search will prefer shorter sequences. The
-                # highest attainable score calculation is based on the current
-                # sequence length.
-                highest_attainable_score = (
-                    best_running_seq.get_beam_search_score(
-                        length_penalty=length_penalty,
-                        eos_token_id=best_running_seq.eos_token_id))
-        return current_worst_score >= highest_attainable_score
-
-    def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
-                                        outputs: SequenceGroupOutput) -> None:
-
-        # Process prompt logprobs
-        prompt_logprobs = outputs.prompt_logprobs
-        if prompt_logprobs is not None and seq_group.sampling_params.detokenize:
-            self.detokenizer.decode_prompt_logprobs_inplace(
-                seq_group, prompt_logprobs)
-            seq_group.prompt_logprobs = prompt_logprobs
-
-        # Process samples
-        samples = outputs.samples
-        parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
-        existing_finished_seqs = seq_group.get_finished_seqs()
-        parent_child_dict = {
-            parent_seq.seq_id: []
-            for parent_seq in parent_seqs
-        }
-        for sample in samples:
-            parent_child_dict[sample.parent_seq_id].append(sample)
-        # List of (child, parent)
-        child_seqs: List[Tuple[Sequence, Sequence]] = []
-
-        # Process the child samples for each parent sequence
-        for parent in parent_seqs:
-            child_samples: List[SequenceOutput] = parent_child_dict[
-                parent.seq_id]
-            if len(child_samples) == 0:
-                # This parent sequence has no children samples. Remove
-                # the parent sequence from the sequence group since it will
-                # not be used in the future iterations.
-                parent.status = SequenceStatus.FINISHED_ABORTED
-                seq_group.remove(parent.seq_id)
-                self.scheduler.free_seq(parent)
-                continue
-            # Fork the parent sequence if there are multiple child samples.
-            for child_sample in child_samples[:-1]:
-                new_child_seq_id = next(self.seq_counter)
-                child = parent.fork(new_child_seq_id)
-                child.append_token_id(child_sample.output_token,
-                                      child_sample.logprobs)
-                child_seqs.append((child, parent))
-            # Continue the parent sequence for the last child sample.
-            # We reuse the parent sequence here to reduce redundant memory
-            # copies, especially when using non-beam search sampling methods.
-            last_child_sample = child_samples[-1]
-            parent.append_token_id(last_child_sample.output_token,
-                                   last_child_sample.logprobs)
-            child_seqs.append((parent, parent))
-
-        for seq, _ in child_seqs:
-            if seq_group.sampling_params.detokenize:
-                new_char_count = self.detokenizer.decode_sequence_inplace(
-                    seq, seq_group.sampling_params)
-            else:
-                new_char_count = 0
-            self._check_stop(seq, new_char_count, seq_group.sampling_params)
-
-        # Non-beam search case
-        if not seq_group.sampling_params.use_beam_search:
-            # For newly created child sequences, add them to the sequence group
-            # and fork them in block manager if they are not finished.
-            for seq, parent in child_seqs:
-                if seq is not parent:
-                    seq_group.add(seq)
-                    if not seq.is_finished():
-                        self.scheduler.fork_seq(parent, seq)
-
-            # Free the finished and selected parent sequences' memory in block
-            # manager. Keep them in the sequence group as candidate output.
-            # NOTE: we need to fork the new sequences before freeing the
-            # old sequences.
-            for seq, parent in child_seqs:
-                if seq is parent and seq.is_finished():
-                    self.scheduler.free_seq(seq)
-            return
-
-        # Beam search case
-        # Select the child sequences to keep in the sequence group.
-        selected_child_seqs = []
-        unselected_child_seqs = []
-        beam_width = seq_group.sampling_params.best_of
-        length_penalty = seq_group.sampling_params.length_penalty
-
-        # Select the newly finished sequences with the highest scores
-        # to replace existing finished sequences.
-        # Tuple of (seq, parent, is_new)
-        existing_finished_seqs = [(seq, None, False)
-                                  for seq in existing_finished_seqs]
-        new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs
-                             if seq.is_finished()]
-        all_finished_seqs = existing_finished_seqs + new_finished_seqs
-        # Sort the finished sequences by their scores.
-        all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score(
-            length_penalty=length_penalty, eos_token_id=x[0].eos_token_id),
-                               reverse=True)
-        for seq, parent, is_new in all_finished_seqs[:beam_width]:
-            if is_new:
-                # A newly generated child sequence finishes and has a high
-                # score, so we will add it into the sequence group.
-                selected_child_seqs.append((seq, parent))
-        for seq, parent, is_new in all_finished_seqs[beam_width:]:
-            if is_new:
-                # A newly generated child sequence finishes but has a low
-                # score, so we will not add it into the sequence group.
-                # Additionally, if this sequence is a continuation of a
-                # parent sequence, we will need remove the parent sequence
-                # from the sequence group.
-                unselected_child_seqs.append((seq, parent))
-            else:
-                # An existing finished sequence has a low score, so we will
-                # remove it from the sequence group.
-                seq_group.remove(seq.seq_id)
-
-        # select the top beam_width sequences from the running
-        # sequences for the next iteration to continue the beam
-        # search.
-        running_child_seqs = [(seq, parent) for seq, parent in child_seqs
-                              if not seq.is_finished()]
-        # Sort the running sequences by their scores.
-        running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score(
-            length_penalty=length_penalty, eos_token_id=x[0].eos_token_id),
-                                reverse=True)
-
-        # Check if we can stop the beam search.
-        if len(running_child_seqs) == 0:
-            # No running sequences, stop the beam search.
-            stop_beam_search = True
-        elif len(all_finished_seqs) < beam_width:
-            # Not enough finished sequences, continue the beam search.
-            stop_beam_search = False
-        else:
-            # Check the early stopping criteria
-            best_running_seq = running_child_seqs[0][0]
-            current_worst_seq = all_finished_seqs[beam_width - 1][0]
-            stop_beam_search = self._check_beam_search_early_stopping(
-                seq_group.sampling_params.early_stopping,
-                seq_group.sampling_params, best_running_seq, current_worst_seq)
-
-        if stop_beam_search:
-            # Stop the beam search and remove all the running sequences from
-            # the sequence group.
-            unselected_child_seqs.extend(running_child_seqs)
-        else:
-            # Continue the beam search and select the top beam_width sequences
-            # to continue the beam search.
-            selected_child_seqs.extend(running_child_seqs[:beam_width])
-            # The remaining running sequences will not be used in the next
-            # iteration. Again, if these sequences are continuations of
-            # parent sequences, we will need to remove the parent sequences
-            # from the sequence group.
-            unselected_child_seqs.extend(running_child_seqs[beam_width:])
-
-        # For newly created child sequences, add them to the sequence group
-        # and fork them in block manager if they are not finished.
-        for seq, parent in selected_child_seqs:
-            if seq is not parent:
-                seq_group.add(seq)
-                if not seq.is_finished():
-                    self.scheduler.fork_seq(parent, seq)
-
-        # Free the finished and selected parent sequences' memory in block
-        # manager. Keep them in the sequence group as candidate output.
-        for seq, parent in selected_child_seqs:
-            if seq is parent and seq.is_finished():
-                self.scheduler.free_seq(seq)
-
-        # Remove the unselected parent sequences from the sequence group and
-        # free their memory in block manager.
-        for seq, parent in unselected_child_seqs:
-            if seq is parent:
-                # Remove the parent sequence if it is not selected for next
-                # iteration
-                seq_group.remove(seq.seq_id)
-                self.scheduler.free_seq(seq)
-
     def _process_model_outputs(
-            self, output: SamplerOutput,
-            scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
+            self, output: List[SamplerOutput],
+            scheduled_seq_groups: List[SequenceGroup],
+            ignored_seq_groups: List[SequenceGroup]) -> List[RequestOutput]:
+        """Apply the model output to the sequences in the scheduled seq groups.
+        
+        Returns RequestOutputs that can be returned to the client.
+        """
+
         now = time.time()
+
+        # Organize outputs by [sequence group][step] instead of
+        # [step][sequence group].
+        output_by_sequence_group = create_output_by_sequence_group(
+            sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups))
+
         # Update the scheduled sequence groups with the model outputs.
-        scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
-        for scheduled_seq_group, outputs in zip(scheduled_seq_groups, output):
+        for scheduled_seq_group, outputs in zip(scheduled_seq_groups,
+                                                output_by_sequence_group):
             seq_group = scheduled_seq_group.seq_group
             seq_group.update_num_computed_tokens(
                 scheduled_seq_group.token_chunk_size)
             # If uncomputed tokens > 0, it means prefill is chunked.
             # We don't need to process outputs in that case.
             if seq_group.get_num_uncomputed_tokens() == 0:
-                self._process_sequence_group_outputs(seq_group, outputs)
+                self.output_processor.process_outputs(seq_group, outputs)
 
         # Free the finished sequence groups.
         self.scheduler.free_finished_seq_groups()
@@ -659,13 +469,9 @@ def _process_model_outputs(
             seq_group.maybe_set_first_token_time(now)
             request_output = RequestOutput.from_seq_group(seq_group)
             request_outputs.append(request_output)
-        for seq_group in scheduler_outputs.ignored_seq_groups:
+        for seq_group in ignored_seq_groups:
             request_output = RequestOutput.from_seq_group(seq_group)
             request_outputs.append(request_output)
-
-        # Log stats.
-        if self.log_stats:
-            self.stat_logger.log(self._get_stats(scheduler_outputs))
         return request_outputs
 
     def step(self) -> List[RequestOutput]:
@@ -723,13 +529,23 @@ def step(self) -> List[RequestOutput]:
 
         if not scheduler_outputs.is_empty():
             output = self.model_executor.execute_model(
-                seq_group_metadata_list, scheduler_outputs.blocks_to_swap_in,
-                scheduler_outputs.blocks_to_swap_out,
-                scheduler_outputs.blocks_to_copy)
+                seq_group_metadata_list=seq_group_metadata_list,
+                blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
+                blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
+                blocks_to_copy=scheduler_outputs.blocks_to_copy,
+                num_lookahead_slots=scheduler_outputs.num_lookahead_slots)
         else:
             output = []
 
-        return self._process_model_outputs(output, scheduler_outputs)
+        request_outputs = self._process_model_outputs(
+            output, scheduler_outputs.scheduled_seq_groups,
+            scheduler_outputs.ignored_seq_groups)
+
+        # Log stats.
+        if self.log_stats:
+            self.stat_logger.log(self._get_stats(scheduler_outputs))
+
+        return request_outputs
 
     def do_log_stats(self) -> None:
         """Forced log when no requests active."""
@@ -809,87 +625,6 @@ def _get_stats(self,
             time_e2e_requests=time_e2e_requests,
         )
 
-    def _check_stop(self, seq: Sequence, new_char_count: int,
-                    sampling_params: SamplingParams) -> None:
-        """Stop the finished sequences.
-
-       new_char_count is the number of chars added to the
-           sequence's output text for the newly generated token
-        """
-
-        # Check if the minimum number of tokens has been generated yet;
-        # skip the stop string/token checks if not
-        if seq.get_output_len() < sampling_params.min_tokens:
-            return
-
-        # Check if the sequence has generated the EOS token.
-        if ((not sampling_params.ignore_eos)
-                and seq.get_last_token_id() == seq.eos_token_id):
-            seq.status = SequenceStatus.FINISHED_STOPPED
-            return
-
-        # Check if a stop token was encountered.
-        # This assumes a single token produced per step.
-        last_token_id = seq.get_last_token_id()
-        if last_token_id in sampling_params.stop_token_ids:
-            if new_char_count and (
-                    not sampling_params.include_stop_str_in_output):
-                # Remove last token
-                seq.output_text = seq.output_text[:-new_char_count]
-            seq.status = SequenceStatus.FINISHED_STOPPED
-            seq.stop_reason = last_token_id
-            return
-
-        # Check if any stop strings are matched.
-        stop_str = self._check_stop_strings(seq, new_char_count,
-                                            sampling_params)
-        if stop_str is not None:
-            seq.status = SequenceStatus.FINISHED_STOPPED
-            seq.stop_reason = stop_str
-            return
-
-        # Check if the sequence has reached max_model_len.
-        if seq.get_len() > self.scheduler_config.max_model_len:
-            seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
-            return
-
-        # Check if the sequence has reached max_tokens.
-        if seq.get_output_len() == sampling_params.max_tokens:
-            seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
-            return
-
-    @staticmethod
-    def _check_stop_strings(seq: Sequence, new_char_count: int,
-                            sampling_params: SamplingParams) -> Optional[str]:
-        """Check if any stop strings are matched and truncate sequence
-        output text accordingly.
-
-        Returns the stop string if matched or else None.
-        """
-        if not new_char_count:
-            return None
-
-        for stop_str in sampling_params.stop:
-            stop_string_len = len(stop_str)
-            # Avoid searching already-searched text.
-            stop_index = seq.output_text.find(
-                stop_str, -new_char_count - stop_string_len)
-            if stop_index == -1:
-                continue
-
-            if sampling_params.include_stop_str_in_output:
-                # Truncate to end of stop string.
-                stop_index += stop_string_len
-                if stop_index >= len(seq.output_text):
-                    # No truncation required.
-                    return stop_str
-
-            # Truncate the output text to either the beginning
-            # or end of the stop string.
-            seq.output_text = seq.output_text[:stop_index]
-            return stop_str
-        return None
-
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_executor.add_lora(lora_request)
 
diff --git a/vllm/engine/output_processor/__init__.py b/vllm/engine/output_processor/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py
new file mode 100644
index 0000000000000..9ddac7a04cb36
--- /dev/null
+++ b/vllm/engine/output_processor/interfaces.py
@@ -0,0 +1,69 @@
+from abc import ABC, abstractmethod
+from typing import Callable, Iterable, List
+
+from transformers import PreTrainedTokenizer
+
+from vllm.config import SchedulerConfig
+from vllm.core.scheduler import Scheduler
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.sequence import Sequence, SequenceGroup, SequenceGroupOutput
+from vllm.transformers_utils.detokenizer import Detokenizer
+
+
+class SequenceGroupOutputProcessor(ABC):
+    """Interface for logic that processes new token ids in sequence groups,
+    managing detokenization, stop checking, and freeing/forking sequences with
+    the scheduler.
+
+    This is highly coupled with the LLMEngine and should be seen as an extension
+    of it. The logic is separated to simplify the LLMEngine class and allow
+    separate implementations for single-step decoding (which supports beam
+    search sequence forking) and multi-step decoding (which does not support
+    beam search, but does support speculative decoding).
+    """
+
+    @staticmethod
+    def create_output_processor(
+        scheduler_config: SchedulerConfig,
+        detokenizer: Detokenizer,
+        scheduler: Scheduler,
+        seq_counter: Iterable[int],
+        get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
+        stop_checker: "StopChecker",
+    ):
+        """Create an output processor.
+
+        This returns a single-step output processor if num_lookahead_slots is
+        zero, else returns a multi-step output processor.
+        """
+        if scheduler_config.num_lookahead_slots == 0:
+            # Importing here to avoid cycle.
+            from vllm.engine.output_processor.single_step import (
+                SingleStepOutputProcessor)
+            return SingleStepOutputProcessor(
+                scheduler_config,
+                detokenizer,
+                scheduler,
+                seq_counter,
+                stop_checker,
+            )
+        else:
+            # Importing here to avoid cycle.
+            from vllm.engine.output_processor.multi_step import (
+                MultiStepOutputProcessor)
+            return MultiStepOutputProcessor(
+                detokenizer,
+                scheduler,
+                seq_counter,
+                get_tokenizer_for_seq,
+                stop_checker,
+            )
+
+    @abstractmethod
+    def process_outputs(self, sequence_group: SequenceGroup,
+                        outputs: List[SequenceGroupOutput]) -> None:
+        """Process new token ids for the sequence group. Handles logic such as
+        detokenization, stop checking, and freeing/forking sequences in the
+        scheduler.
+        """
+        pass
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
new file mode 100644
index 0000000000000..50da0d35fcec1
--- /dev/null
+++ b/vllm/engine/output_processor/multi_step.py
@@ -0,0 +1,126 @@
+from typing import Callable, Iterable, List
+
+from transformers import PreTrainedTokenizer
+
+from vllm.core.scheduler import Scheduler
+from vllm.engine.output_processor.interfaces import (
+    SequenceGroupOutputProcessor)
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.logger import init_logger
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import (Logprob, Sequence, SequenceGroup,
+                           SequenceGroupOutput, SequenceOutput, SequenceStatus)
+from vllm.transformers_utils.detokenizer import Detokenizer
+
+logger = init_logger(__name__)
+
+
+class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
+    """SequenceGroupOutputProcessor which handles logic related to
+    detokenization and stopping conditions. It specializes to "multi-step
+    decoding", where vLLM's worker may generate multiple tokens per invocation.
+    This is currently mutually exclusive with advanced sampling techniques like
+    beam search, which motivates the separation of this logic from the single
+    step output processor.
+
+    This class is responsible for things such as correctly appending all new
+    token ids to their sequence, detokenizing new token ids, truncating new
+    output tokens after an eos token, and correctly handling the case where the
+    number of new output tokens per sequence differs in a single batch.
+    """
+
+    def __init__(
+        self,
+        detokenizer: Detokenizer,
+        scheduler: Scheduler,
+        seq_counter: Iterable[int],
+        get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
+        stop_checker: StopChecker,
+    ):
+        self.detokenizer = detokenizer
+        self.scheduler = scheduler
+        self.seq_counter = seq_counter
+        self.get_tokenizer_for_seq = get_tokenizer_for_seq
+        self.stop_checker = stop_checker
+
+    def process_outputs(self, sequence_group: SequenceGroup,
+                        outputs: List[SequenceGroupOutput]) -> None:
+        """Append new tokens in the outputs to sequences in the sequence group.
+
+        This only supports sequence groups of size 1. It supports greater than
+        one new token per sequence.
+
+        This applies logic like stop condition checking and detokenization,
+        including freeing finished sequences. It also handles cases where there
+        are tokens emitted after the EOS token.
+        """
+        seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING)
+
+        assert seqs, "expected running sequences"
+        assert len(seqs) == 1, (
+            "Beam search not supported in multi-step decoding.")
+        seq = seqs[0]
+
+        # Since there's only one sequence per sequence group, we can take the
+        # first sample.
+        samples = [outputs[step].samples[0] for step in range(len(outputs))]
+
+        # -1 means the output token is not valid (eg. due to spec decode
+        # rejecting tokens).
+        valid_samples = [
+            sample for sample in samples if sample.output_token != -1
+        ]
+        assert valid_samples
+
+        self._process_seq_outputs(seq, valid_samples,
+                                  sequence_group.sampling_params)
+
+    def _process_seq_outputs(self, seq: Sequence,
+                             valid_samples: List[SequenceOutput],
+                             sampling_params: SamplingParams) -> None:
+        output_token_ids = [sample.output_token for sample in valid_samples]
+
+        # Truncate to max_tokens if necessary.
+        remaining_tokens = sampling_params.max_tokens - (seq.get_output_len() +
+                                                         len(output_token_ids))
+        if remaining_tokens < 0:
+            valid_samples = valid_samples[:remaining_tokens]
+            output_token_ids = output_token_ids[:remaining_tokens]
+
+        # Truncate any tokens after EOS. This is required as spec decode
+        # generates a fixed number of tokens without evaluating stopping
+        # conditions within the block. This can cause an eos token to be
+        # unintentionally ignored.
+        if not sampling_params.ignore_eos:
+            eos_token_id = self.get_tokenizer_for_seq(seq).eos_token_id
+            # Avoiding .index calls as exception throwing in the happy path
+            # is expensive.
+            for i in range(len(output_token_ids)):
+                if output_token_ids[i] == eos_token_id:
+                    output_token_ids = output_token_ids[:i + 1]
+                    valid_samples = valid_samples[:i + 1]
+                    break
+
+        # Incrementally append tokens to the sequence, as if we had only one new
+        # token.
+        for output_token_id in output_token_ids:
+            seq.append_token_id(
+                token_id=output_token_id,
+                # TODO emit logprobs in multi-step decoding.
+                logprobs={output_token_id: Logprob(0.0)},
+            )
+
+            new_char_count = 0
+            if sampling_params.detokenize:
+                new_char_count = self.detokenizer.decode_sequence_inplace(
+                    seq, sampling_params)
+
+            self.stop_checker.maybe_stop_sequence(
+                seq,
+                new_char_count=new_char_count,
+                sampling_params=sampling_params)
+            if seq.is_finished():
+                break
+
+        if seq.is_finished():
+            self.scheduler.free_seq(seq)
diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
new file mode 100644
index 0000000000000..1b7eb014f802b
--- /dev/null
+++ b/vllm/engine/output_processor/single_step.py
@@ -0,0 +1,276 @@
+from typing import Iterable, List, Tuple, Union
+
+from vllm.config import SchedulerConfig
+from vllm.core.scheduler import Scheduler
+from vllm.engine.output_processor.interfaces import (
+    SequenceGroupOutputProcessor)
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.logger import init_logger
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput,
+                           SequenceOutput, SequenceStatus)
+from vllm.transformers_utils.detokenizer import Detokenizer
+
+logger = init_logger(__name__)
+
+
+class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
+    """SequenceGroupOutputProcessor which handles "output processing" logic,
+    which happens after the model returns generated token ids and before
+    scheduling of the next batch. Output processing logic includes
+    detokenization, and determining if a sequence is finished (e.g. via max len
+    or eos token).
+
+    The SingleStepOutputProcessor is specialized to the case where the model
+    emits at most a single token per invocation, which precludes configurations
+    such as speculative decoding or multi-step decoding. This enables beam
+    search sampling, which requires forking/finishing/freeing sequences in a way
+    that is currently difficult to schedule multiple steps ahead of time.
+    """
+
+    def __init__(
+        self,
+        scheduler_config: SchedulerConfig,
+        detokenizer: Detokenizer,
+        scheduler: Scheduler,
+        seq_counter: Iterable[int],
+        stop_checker: StopChecker,
+    ):
+        self.scheduler_config = scheduler_config
+        self.detokenizer = detokenizer
+        self.scheduler = scheduler
+        self.seq_counter = seq_counter
+        self.stop_checker = stop_checker
+
+    def process_outputs(self, sequence_group: SequenceGroup,
+                        outputs: List[SequenceGroupOutput]) -> None:
+        """Append all new tokens to sequences in the sequence group. Fork any
+        surviving beam candidates; free any unsurviving ones.
+
+        Invokes detokenizer to detokenize new tokens, and also marks sequences
+        as finished if they meet stop conditions.
+        """
+        assert (len(outputs) == 1
+                ), f"{type(self)} does not support multiple outputs per step"
+        return self._process_sequence_group_outputs(sequence_group, outputs[0])
+
+    def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
+                                        outputs: SequenceGroupOutput) -> None:
+
+        # Process prompt logprobs
+        prompt_logprobs = outputs.prompt_logprobs
+        if prompt_logprobs is not None and seq_group.sampling_params.detokenize:
+            self.detokenizer.decode_prompt_logprobs_inplace(
+                seq_group, prompt_logprobs)
+            seq_group.prompt_logprobs = prompt_logprobs
+
+        # Process samples
+        samples = outputs.samples
+        parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
+        existing_finished_seqs = seq_group.get_finished_seqs()
+        parent_child_dict = {
+            parent_seq.seq_id: []
+            for parent_seq in parent_seqs
+        }
+        for sample in samples:
+            parent_child_dict[sample.parent_seq_id].append(sample)
+        # List of (child, parent)
+        child_seqs: List[Tuple[Sequence, Sequence]] = []
+
+        # Process the child samples for each parent sequence
+        for parent in parent_seqs:
+            child_samples: List[SequenceOutput] = parent_child_dict[
+                parent.seq_id]
+            if len(child_samples) == 0:
+                # This parent sequence has no children samples. Remove
+                # the parent sequence from the sequence group since it will
+                # not be used in the future iterations.
+                parent.status = SequenceStatus.FINISHED_ABORTED
+                seq_group.remove(parent.seq_id)
+                self.scheduler.free_seq(parent)
+                continue
+            # Fork the parent sequence if there are multiple child samples.
+            for child_sample in child_samples[:-1]:
+                new_child_seq_id = next(self.seq_counter)
+                child = parent.fork(new_child_seq_id)
+                child.append_token_id(child_sample.output_token,
+                                      child_sample.logprobs)
+                child_seqs.append((child, parent))
+            # Continue the parent sequence for the last child sample.
+            # We reuse the parent sequence here to reduce redundant memory
+            # copies, especially when using non-beam search sampling methods.
+            last_child_sample = child_samples[-1]
+            parent.append_token_id(last_child_sample.output_token,
+                                   last_child_sample.logprobs)
+            child_seqs.append((parent, parent))
+
+        for seq, _ in child_seqs:
+            if seq_group.sampling_params.detokenize:
+                new_char_count = self.detokenizer.decode_sequence_inplace(
+                    seq, seq_group.sampling_params)
+            else:
+                new_char_count = 0
+            self.stop_checker.maybe_stop_sequence(seq, new_char_count,
+                                                  seq_group.sampling_params)
+
+        # Non-beam search case
+        if not seq_group.sampling_params.use_beam_search:
+            # For newly created child sequences, add them to the sequence group
+            # and fork them in block manager if they are not finished.
+            for seq, parent in child_seqs:
+                if seq is not parent:
+                    seq_group.add(seq)
+                    if not seq.is_finished():
+                        self.scheduler.fork_seq(parent, seq)
+
+            # Free the finished and selected parent sequences' memory in block
+            # manager. Keep them in the sequence group as candidate output.
+            # NOTE: we need to fork the new sequences before freeing the
+            # old sequences.
+            for seq, parent in child_seqs:
+                if seq is parent and seq.is_finished():
+                    self.scheduler.free_seq(seq)
+            return
+
+        # Beam search case
+        # Select the child sequences to keep in the sequence group.
+        selected_child_seqs = []
+        unselected_child_seqs = []
+        beam_width = seq_group.sampling_params.best_of
+        length_penalty = seq_group.sampling_params.length_penalty
+
+        # Select the newly finished sequences with the highest scores
+        # to replace existing finished sequences.
+        # Tuple of (seq, parent, is_new)
+        existing_finished_seqs = [(seq, None, False)
+                                  for seq in existing_finished_seqs]
+        new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs
+                             if seq.is_finished()]
+        all_finished_seqs = existing_finished_seqs + new_finished_seqs
+        # Sort the finished sequences by their scores.
+        all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score(
+            length_penalty=length_penalty, eos_token_id=x[0].eos_token_id),
+                               reverse=True)
+        for seq, parent, is_new in all_finished_seqs[:beam_width]:
+            if is_new:
+                # A newly generated child sequence finishes and has a high
+                # score, so we will add it into the sequence group.
+                selected_child_seqs.append((seq, parent))
+        for seq, parent, is_new in all_finished_seqs[beam_width:]:
+            if is_new:
+                # A newly generated child sequence finishes but has a low
+                # score, so we will not add it into the sequence group.
+                # Additionally, if this sequence is a continuation of a
+                # parent sequence, we will need remove the parent sequence
+                # from the sequence group.
+                unselected_child_seqs.append((seq, parent))
+            else:
+                # An existing finished sequence has a low score, so we will
+                # remove it from the sequence group.
+                seq_group.remove(seq.seq_id)
+
+        # select the top beam_width sequences from the running
+        # sequences for the next iteration to continue the beam
+        # search.
+        running_child_seqs = [(seq, parent) for seq, parent in child_seqs
+                              if not seq.is_finished()]
+        # Sort the running sequences by their scores.
+        running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score(
+            length_penalty=length_penalty, eos_token_id=x[0].eos_token_id),
+                                reverse=True)
+
+        # Check if we can stop the beam search.
+        if len(running_child_seqs) == 0:
+            # No running sequences, stop the beam search.
+            stop_beam_search = True
+        elif len(all_finished_seqs) < beam_width:
+            # Not enough finished sequences, continue the beam search.
+            stop_beam_search = False
+        else:
+            # Check the early stopping criteria
+            best_running_seq = running_child_seqs[0][0]
+            current_worst_seq = all_finished_seqs[beam_width - 1][0]
+            stop_beam_search = self._check_beam_search_early_stopping(
+                seq_group.sampling_params.early_stopping,
+                seq_group.sampling_params, best_running_seq, current_worst_seq)
+
+        if stop_beam_search:
+            # Stop the beam search and remove all the running sequences from
+            # the sequence group.
+            unselected_child_seqs.extend(running_child_seqs)
+        else:
+            # Continue the beam search and select the top beam_width sequences
+            # to continue the beam search.
+            selected_child_seqs.extend(running_child_seqs[:beam_width])
+            # The remaining running sequences will not be used in the next
+            # iteration. Again, if these sequences are continuations of
+            # parent sequences, we will need to remove the parent sequences
+            # from the sequence group.
+            unselected_child_seqs.extend(running_child_seqs[beam_width:])
+
+        # For newly created child sequences, add them to the sequence group
+        # and fork them in block manager if they are not finished.
+        for seq, parent in selected_child_seqs:
+            if seq is not parent:
+                seq_group.add(seq)
+                if not seq.is_finished():
+                    self.scheduler.fork_seq(parent, seq)
+
+        # Free the finished and selected parent sequences' memory in block
+        # manager. Keep them in the sequence group as candidate output.
+        for seq, parent in selected_child_seqs:
+            if seq is parent and seq.is_finished():
+                self.scheduler.free_seq(seq)
+
+        # Remove the unselected parent sequences from the sequence group and
+        # free their memory in block manager.
+        for seq, parent in unselected_child_seqs:
+            if seq is parent:
+                # Remove the parent sequence if it is not selected for next
+                # iteration
+                seq_group.remove(seq.seq_id)
+                self.scheduler.free_seq(seq)
+
+    def _check_beam_search_early_stopping(
+        self,
+        early_stopping: Union[bool, str],
+        sampling_params: SamplingParams,
+        best_running_seq: Sequence,
+        current_worst_seq: Sequence,
+    ) -> bool:
+        assert sampling_params.use_beam_search
+        length_penalty = sampling_params.length_penalty
+        if early_stopping is True:
+            return True
+
+        current_worst_score = current_worst_seq.get_beam_search_score(
+            length_penalty=length_penalty,
+            eos_token_id=current_worst_seq.eos_token_id)
+        if early_stopping is False:
+            highest_attainable_score = best_running_seq.get_beam_search_score(
+                length_penalty=length_penalty,
+                eos_token_id=best_running_seq.eos_token_id)
+        else:
+            assert early_stopping == "never"
+            if length_penalty > 0.0:
+                # If length_penalty > 0.0, beam search will prefer longer
+                # sequences. The highest attainable score calculation is
+                # based on the longest possible sequence length in this case.
+                max_possible_length = max(
+                    best_running_seq.get_prompt_len() +
+                    sampling_params.max_tokens,
+                    self.scheduler_config.max_model_len)
+                highest_attainable_score = (
+                    best_running_seq.get_beam_search_score(
+                        length_penalty=length_penalty,
+                        eos_token_id=best_running_seq.eos_token_id,
+                        seq_len=max_possible_length))
+            else:
+                # Otherwise, beam search will prefer shorter sequences. The
+                # highest attainable score calculation is based on the current
+                # sequence length.
+                highest_attainable_score = (
+                    best_running_seq.get_beam_search_score(
+                        length_penalty=length_penalty,
+                        eos_token_id=best_running_seq.eos_token_id))
+        return current_worst_score >= highest_attainable_score
diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py
new file mode 100644
index 0000000000000..66deb9b591746
--- /dev/null
+++ b/vllm/engine/output_processor/stop_checker.py
@@ -0,0 +1,101 @@
+from typing import Callable, Optional
+
+from transformers import PreTrainedTokenizer
+
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import Sequence, SequenceStatus
+
+
+class StopChecker:
+    """LLMEngine helper class which separates out the logic involving stop
+    checking. This checks things such as: whether the eos token was emitted,
+    whether the max_tokens has been consumed, whether a stop string has been
+    emitted, or if we have exceeded the max model len.
+    """
+
+    def __init__(self, max_model_len: int,
+                 get_tokenizer_for_seq: Callable[[Sequence],
+                                                 PreTrainedTokenizer]):
+        self.max_model_len = max_model_len
+        self.get_tokenizer_for_seq = get_tokenizer_for_seq
+
+    def maybe_stop_sequence(self, seq: Sequence, new_char_count: int,
+                            sampling_params: SamplingParams) -> None:
+        """Stop the finished sequences.
+
+       new_char_count is the number of chars added to the
+           sequence's output text for the newly generated token
+        """
+
+        # Check if the minimum number of tokens has been generated yet;
+        # skip the stop string/token checks if not
+        if seq.get_output_len() < sampling_params.min_tokens:
+            return
+
+        # Check if the sequence has generated the EOS token.
+        if ((not sampling_params.ignore_eos)
+                and seq.get_last_token_id() == seq.eos_token_id):
+            seq.status = SequenceStatus.FINISHED_STOPPED
+            return
+
+        # Check if a stop token was encountered.
+        # This assumes a single token produced per step.
+        last_token_id = seq.get_last_token_id()
+        if last_token_id in sampling_params.stop_token_ids:
+            if new_char_count and (
+                    not sampling_params.include_stop_str_in_output):
+                # Remove last token
+                seq.output_text = seq.output_text[:-new_char_count]
+            seq.status = SequenceStatus.FINISHED_STOPPED
+            seq.stop_reason = last_token_id
+            return
+
+        # Check if any stop strings are matched.
+        stop_str = self._check_stop_strings(seq, new_char_count,
+                                            sampling_params)
+        if stop_str is not None:
+            seq.status = SequenceStatus.FINISHED_STOPPED
+            seq.stop_reason = stop_str
+            return
+
+        # Check if the sequence has reached max_model_len.
+        if seq.get_len() > self.max_model_len:
+            seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
+            return
+
+        # Check if the sequence has reached max_tokens.
+        if seq.get_output_len() == sampling_params.max_tokens:
+            seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
+            return
+
+    @staticmethod
+    def _check_stop_strings(seq: Sequence, new_char_count: int,
+                            sampling_params: SamplingParams) -> Optional[str]:
+        """Check if any stop strings are matched and truncate sequence
+        output text accordingly.
+
+        Returns the stop string if matched or else None.
+        """
+        if not new_char_count:
+            return None
+
+        for stop_str in sampling_params.stop:
+            stop_string_len = len(stop_str)
+            # Avoid searching already-searched text.
+            stop_index = seq.output_text.find(
+                stop_str, -new_char_count - stop_string_len)
+            if stop_index == -1:
+                continue
+
+            if sampling_params.include_stop_str_in_output:
+                # Truncate to end of stop string.
+                stop_index += stop_string_len
+                if stop_index >= len(seq.output_text):
+                    # No truncation required.
+                    return stop_str
+
+            # Truncate the output text to either the beginning
+            # or end of the stop string.
+            seq.output_text = seq.output_text[:stop_index]
+            return stop_str
+        return None
diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py
new file mode 100644
index 0000000000000..5fbb09a857a46
--- /dev/null
+++ b/vllm/engine/output_processor/util.py
@@ -0,0 +1,16 @@
+from typing import List
+
+from vllm.sequence import SamplerOutput
+
+
+def create_output_by_sequence_group(sampler_outputs: List[SamplerOutput],
+                                    num_seq_groups: int):
+    """Helper method which transforms a 2d list organized by
+    [step][sequence group] into [sequence group][step].
+    """
+    output_by_sequence_group = [[] for _ in range(num_seq_groups)]
+    for step in sampler_outputs:
+        for i, sequence_group_output in enumerate(step):
+            output_by_sequence_group[i].append(sequence_group_output)
+
+    return output_by_sequence_group
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 426e2c41d8427..f925a6fc93dcd 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -74,7 +74,8 @@ def execute_model(self,
                       seq_group_metadata_list: List[SequenceGroupMetadata],
                       blocks_to_swap_in: Dict[int, int],
                       blocks_to_swap_out: Dict[int, int],
-                      blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
+                      blocks_to_copy: Dict[int, List[int]],
+                      num_lookahead_slots: int) -> List[SamplerOutput]:
         output = self.driver_worker.execute_model(
             seq_group_metadata_list=seq_group_metadata_list,
             blocks_to_swap_in=blocks_to_swap_in,
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 8cc04c5299ca1..1839b5603ff3e 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -72,8 +72,9 @@ def execute_model(self,
                       seq_group_metadata_list: List[SequenceGroupMetadata],
                       blocks_to_swap_in: Dict[int, int],
                       blocks_to_swap_out: Dict[int, int],
-                      blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
-        """Executes one model step on the given sequences."""
+                      blocks_to_copy: Dict[int, List[int]],
+                      num_lookahead_slots: int) -> List[SamplerOutput]:
+        """Executes at least one model step on the given sequences."""
         raise NotImplementedError
 
     @abstractmethod
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index 3a9537effe6d9..6e4a765e2ffd5 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -13,13 +13,17 @@
 class GPUExecutor(ExecutorBase):
 
     def _init_executor(self) -> None:
-        assert (not self.speculative_config
-                ), "Speculative decoding not yet supported for GPU backend"
+        """Initialize the worker and load the model.
 
-        # Instantiate the worker and load the model to GPU.
-        self._init_worker()
+        If speculative decoding is enabled, we instead create the speculative
+        worker.
+        """
+        if self.speculative_config is None:
+            self._init_non_spec_worker()
+        else:
+            self._init_spec_worker()
 
-    def _init_worker(self):
+    def _init_non_spec_worker(self):
         # Lazy import the Worker to avoid importing torch.cuda/xformers
         # before CUDA_VISIBLE_DEVICES is set in the Worker
         from vllm.worker.worker import Worker
@@ -46,6 +50,57 @@ def _init_worker(self):
         self.driver_worker.init_device()
         self.driver_worker.load_model()
 
+    def _init_spec_worker(self):
+        """Initialize a SpecDecodeWorker, using a draft model for proposals.
+        """
+        assert self.speculative_config is not None
+
+        from vllm.spec_decode.multi_step_worker import MultiStepWorker
+        from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
+        from vllm.worker.worker import Worker
+
+        distributed_init_method = get_distributed_init_method(
+            get_ip(), get_open_port())
+
+        target_worker = Worker(
+            model_config=self.model_config,
+            parallel_config=self.parallel_config,
+            scheduler_config=self.scheduler_config,
+            device_config=self.device_config,
+            cache_config=self.cache_config,
+            local_rank=0,
+            rank=0,
+            distributed_init_method=distributed_init_method,
+            lora_config=self.lora_config,
+            vision_language_config=self.vision_language_config,
+            is_driver_worker=True,
+        )
+
+        draft_worker = MultiStepWorker(
+            model_config=self.speculative_config.draft_model_config,
+            parallel_config=self.speculative_config.draft_parallel_config,
+            scheduler_config=self.scheduler_config,
+            device_config=self.device_config,
+            cache_config=self.cache_config,
+            local_rank=0,
+            rank=0,
+            distributed_init_method=distributed_init_method,
+            lora_config=self.lora_config,
+            vision_language_config=self.vision_language_config,
+            is_driver_worker=True,
+        )
+
+        spec_decode_worker = SpecDecodeWorker.from_workers(
+            proposer_worker=draft_worker, scorer_worker=target_worker)
+
+        assert self.parallel_config.world_size == 1, (
+            "GPUExecutor only supports single GPU.")
+
+        self.driver_worker = spec_decode_worker
+
+        # Load model handled in spec decode worker.
+        self.driver_worker.init_device()
+
     def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of available KV blocks by invoking the
         underlying worker.
@@ -63,16 +118,20 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
 
         self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
 
-    def execute_model(self,
-                      seq_group_metadata_list: List[SequenceGroupMetadata],
-                      blocks_to_swap_in: Dict[int, int],
-                      blocks_to_swap_out: Dict[int, int],
-                      blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
+    def execute_model(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+        num_lookahead_slots: int,
+    ) -> List[SamplerOutput]:
         output = self.driver_worker.execute_model(
             seq_group_metadata_list=seq_group_metadata_list,
             blocks_to_swap_in=blocks_to_swap_in,
             blocks_to_swap_out=blocks_to_swap_out,
             blocks_to_copy=blocks_to_copy,
+            num_lookahead_slots=num_lookahead_slots,
         )
         return output
 
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index 273b17a927efd..7cc187e297c9f 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -48,10 +48,13 @@ def execute_model(self,
                       seq_group_metadata_list: List[SequenceGroupMetadata],
                       blocks_to_swap_in: Dict[int, int],
                       blocks_to_swap_out: Dict[int, int],
-                      blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
+                      blocks_to_copy: Dict[int, List[int]],
+                      num_lookahead_slots: int) -> List[SamplerOutput]:
         assert (blocks_to_swap_in == {} and blocks_to_swap_out == {}
                 and blocks_to_copy == {}), (
                     "Cache operations are not supported for Neuron backend.")
+        assert num_lookahead_slots == 0, (
+            "lookahead not supported for Neuron backend.")
 
         output = self.driver_worker.execute_model(
             seq_group_metadata_list=seq_group_metadata_list)
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 4065c0868d79a..5f859fdc9c078 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -242,7 +242,8 @@ def execute_model(self,
                       seq_group_metadata_list: List[SequenceGroupMetadata],
                       blocks_to_swap_in: Dict[int, int],
                       blocks_to_swap_out: Dict[int, int],
-                      blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
+                      blocks_to_copy: Dict[int, List[int]],
+                      num_lookahead_slots: int = 0) -> SamplerOutput:
         all_outputs = self._run_workers(
             "execute_model",
             driver_kwargs={
diff --git a/vllm/sequence.py b/vllm/sequence.py
index dcde81df19923..92362a9a5d2a3 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -693,3 +693,16 @@ def __len__(self):
     def __eq__(self, other: object):
         return isinstance(other,
                           self.__class__) and self.outputs == other.outputs
+
+    def __repr__(self) -> str:
+        """Show the shape of a tensor instead of its values to reduce noise.
+        """
+        sampled_token_probs_repr = ("None" if self.sampled_token_probs is None
+                                    else self.sampled_token_probs.shape)
+        sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else
+                                  self.sampled_token_ids.shape)
+        return (
+            f"SamplerOutput(outputs={self.outputs}, "
+            f"sampled_token_probs={sampled_token_probs_repr}, "
+            f"sampled_token_ids={sampled_token_ids_repr}, "
+            f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})")
diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py
index e0b75837e8a39..88af1dd360155 100644
--- a/vllm/spec_decode/batch_expansion.py
+++ b/vllm/spec_decode/batch_expansion.py
@@ -6,10 +6,10 @@
 from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeScorer, SpeculativeScores)
-from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range,
-                                   sampler_output_to_torch,
+from vllm.spec_decode.util import (get_all_seq_ids, maybe_mock_device_tensors,
+                                   nvtx_range, sampler_output_to_torch,
                                    split_batch_by_proposal_len)
-from vllm.worker.worker import Worker
+from vllm.worker.worker_base import WorkerBase
 
 SeqId = int
 TargetSeqId = int
@@ -31,7 +31,8 @@ class BatchExpansionTop1Scorer(SpeculativeScorer):
     of topk/tree.
     """
 
-    def __init__(self, scorer_worker: Worker, device: str, vocab_size: int):
+    def __init__(self, scorer_worker: WorkerBase, device: str,
+                 vocab_size: int):
         self._scorer_worker = scorer_worker
         self._device = device
         self._vocab_size = vocab_size
@@ -83,7 +84,9 @@ def score_proposals(
             blocks_to_swap_in=blocks_to_swap_in,
             blocks_to_swap_out=blocks_to_swap_out,
             blocks_to_copy=blocks_to_copy,
-            return_python_output=False)
+        )
+        assert len(target_sampler_output) == 1, "expected single-step output"
+        target_sampler_output = target_sampler_output[0]
 
         all_tokens, all_probs = self._contract_batch(
             original_bs=len(seq_group_metadata_list),
@@ -142,6 +145,16 @@ def _contract_batch(self, original_bs: int,
         This maps the scores of speculative tokens back to their original
         sequences.
         """
+
+        # We mock the device tensors until PR 7/9 is merged (e2e correctness).
+        # https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer
+        maybe_mock_device_tensors(
+            sampler_output=target_sampler_output,
+            batch_size=len(non_spec_indices) + num_scoring_tokens,
+            vocab_size=self._vocab_size,
+            device=self._device,
+        )
+
         (target_token_ids, target_probs, non_spec_target_token_ids,
          non_spec_target_probs) = self._split_scoring_output(
              target_sampler_output, num_scoring_tokens)
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index 73b6e201c67a9..ce63c329a40aa 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -6,7 +6,8 @@
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeProposer)
-from vllm.spec_decode.util import sampler_output_to_torch
+from vllm.spec_decode.util import (maybe_mock_device_tensors,
+                                   sampler_output_to_torch)
 from vllm.worker.worker import Worker
 
 
@@ -69,6 +70,9 @@ def execute_model_multi_step(
                 blocks_to_swap_out=blocks_to_swap_out,
                 blocks_to_copy=blocks_to_copy,
             )
+            assert (len(model_output) == 1
+                    ), "composing multistep workers not supported"
+            model_output = model_output[0]
 
             self._append_new_tokens(model_output,
                                     copied_seq_group_metadata_list)
@@ -341,6 +345,16 @@ def _merge_outputs(
 
         sampler_output = maybe_sampler_output
 
+        # We mock the device tensors until PR 7/9 is merged (e2e correctness).
+        # https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer
+        for step_output in sampler_output:
+            maybe_mock_device_tensors(
+                sampler_output=step_output,
+                batch_size=len(proposal_lens),
+                vocab_size=self._vocab_size,
+                device=self._device,
+            )
+
         proposal_tokens, proposal_probs = sampler_output_to_torch(
             sampler_output)
 
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 885bf537568e3..be3af7be93864 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -3,8 +3,9 @@
 
 import torch
 
+from vllm.logger import init_logger
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
-from vllm.sequence import (SamplerOutput, SequenceGroupMetadata,
+from vllm.sequence import (Logprob, SamplerOutput, SequenceGroupMetadata,
                            SequenceGroupOutput, SequenceOutput)
 from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
@@ -13,8 +14,9 @@
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range,
                                    split_batch_by_proposal_len)
-from vllm.worker.worker import Worker
-from vllm.worker.worker_base import LoraNotSupportedWorkerBase
+from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase
+
+logger = init_logger(__name__)
 
 
 class SpecDecodeWorker(LoraNotSupportedWorkerBase):
@@ -45,10 +47,20 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         More info here https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit.
     """
 
+    @classmethod
+    def from_workers(cls, proposer_worker: MultiStepWorker,
+                     scorer_worker: WorkerBase) -> "SpecDecodeWorker":
+        return SpecDecodeWorker(
+            proposer_worker,
+            scorer_worker,
+            # TODO(cade) disable strict mode for speedup.
+            rejection_sampler=RejectionSampler(strict_mode=True),
+        )
+
     def __init__(
         self,
         proposer_worker: MultiStepWorker,
-        scorer_worker: Worker,
+        scorer_worker: WorkerBase,
         rejection_sampler: RejectionSampler,
         metrics_collector: Optional[AsyncMetricsCollector] = None,
     ):
@@ -87,6 +99,10 @@ def init_device(self) -> None:
         self.scorer_worker.init_device()
         self.proposer_worker.init_device()
 
+        # NOTE(cade): load_model is not part of the WorkerBase interface.
+        self.scorer_worker.load_model()
+        self.proposer_worker.load_model()
+
         self._metrics.init_gpu_tensors(self.rank)
         self.rejection_sampler.init_gpu_tensors(self.rank)
         self.scorer = BatchExpansionTop1Scorer(
@@ -131,7 +147,7 @@ def execute_model(
         blocks_to_swap_in: Optional[Dict[int, int]],
         blocks_to_swap_out: Optional[Dict[int, int]],
         blocks_to_copy: Optional[Dict[int, List[int]]],
-        num_spec_tokens: int,
+        num_lookahead_slots: int,
     ) -> List[SamplerOutput]:
         """Perform speculative decoding on the input batch.
         """
@@ -140,9 +156,11 @@ def execute_model(
             "speculative decoding "
             "requires non-None seq_group_metadata_list")
 
+        logger.info(f"spec_decode_worker.execute_model {num_lookahead_slots=}")
+
         # If no spec tokens, call the proposer and scorer workers normally.
         # Used for prefill.
-        if num_spec_tokens == 0 or len(seq_group_metadata_list) == 0:
+        if num_lookahead_slots == 0 or len(seq_group_metadata_list) == 0:
             return self._run_no_spec(
                 seq_group_metadata_list=seq_group_metadata_list,
                 blocks_to_swap_in=blocks_to_swap_in,
@@ -155,7 +173,7 @@ def execute_model(
             blocks_to_swap_in=blocks_to_swap_in,
             blocks_to_swap_out=blocks_to_swap_out,
             blocks_to_copy=blocks_to_copy,
-            k=num_spec_tokens,
+            k=num_lookahead_slots,
         )
 
     @nvtx_range("spec_decode_worker._run_no_spec")
@@ -170,20 +188,24 @@ def _run_no_spec(
         proposer and scorer model so that the KV cache is consistent between the
         two.
         """
+        logger.info("run proposer worker no spec")
 
         self.proposer_worker.execute_model(
             seq_group_metadata_list=seq_group_metadata_list,
             blocks_to_swap_in=blocks_to_swap_in,
             blocks_to_swap_out=blocks_to_swap_out,
             blocks_to_copy=blocks_to_copy,
-            return_python_output=False)
+        )
 
+        logger.info("run target worker no spec")
         sampler_output = self.scorer_worker.execute_model(
             seq_group_metadata_list=seq_group_metadata_list,
             blocks_to_swap_in=blocks_to_swap_in,
             blocks_to_swap_out=blocks_to_swap_out,
             blocks_to_copy=blocks_to_copy,
         )
+        assert len(sampler_output) == 1
+        sampler_output = sampler_output[0]
 
         # Clear device tensors from sampler output. This reduces communication
         # overhead when the engine runs in a different process than the workers.
@@ -209,11 +231,13 @@ def _run_speculative_decoding_step(
         sequence.
         """
 
+        logger.info("get spec proposals")
         # Generate proposals using draft worker.
         proposals = self.proposer_worker.get_spec_proposals(
             seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out,
             blocks_to_copy, k)
 
+        logger.info("score proposals")
         proposal_scores = self.scorer.score_proposals(
             seq_group_metadata_list,
             blocks_to_swap_in,
@@ -223,9 +247,11 @@ def _run_speculative_decoding_step(
             proposals,
         )
 
+        logger.info("verify proposals")
         accepted_token_ids = self._verify_tokens(seq_group_metadata_list,
                                                  proposal_scores, proposals, k)
 
+        logger.info("create output list")
         return self._create_output_sampler_list(seq_group_metadata_list,
                                                 accepted_token_ids, k)
 
@@ -311,7 +337,7 @@ def _create_output_sampler_list(
                                 parent_seq_id=seq_id,
                                 output_token=token_id,
                                 # TODO Add verifier logprobs.
-                                logprobs={token_id: 0.0},
+                                logprobs={token_id: Logprob(0.0)},
                             )
                         ],
                         prompt_logprobs=None,
diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py
index 406568a4bc08c..eb6d4ca1da8e6 100644
--- a/vllm/spec_decode/util.py
+++ b/vllm/spec_decode/util.py
@@ -82,6 +82,32 @@ def sampler_output_to_torch(
     return sampled_token_ids, sampled_token_probs
 
 
+def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int,
+                              vocab_size: int, device: str) -> None:
+    """Helper method which mocks out the GPU tensors in SamplerOutput with dummy
+    values. This will be removed in PR 7/9.
+    https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer
+    """
+    values = [
+        sampler_output.sampled_token_probs, sampler_output.sampled_token_ids
+    ]
+    assert all(v is None for v in values) or not any(v is None for v in values)
+    if not any(v is None for v in values):
+        # Do nothing if the tensors are already created (usually in unit tests).
+        return
+
+    # Softmax to ensure valid probs.
+    sampler_output.sampled_token_probs = torch.nn.functional.softmax(
+        torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device),
+        dim=-1)
+
+    sampler_output.sampled_token_ids = torch.randint(low=10,
+                                                     high=100,
+                                                     size=(batch_size, ),
+                                                     dtype=torch.long,
+                                                     device=device)
+
+
 @contextmanager
 def nvtx_range(msg, *args, **kwargs):
     """ 
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 6610b9c4be876..afc4a1e1f4630 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -251,7 +251,7 @@ def execute_model(
         blocks_to_swap_in: Optional[Dict[int, int]] = None,
         blocks_to_swap_out: Optional[Dict[int, int]] = None,
         blocks_to_copy: Optional[Dict[int, List[int]]] = None,
-    ) -> Optional[SamplerOutput]:
+    ) -> List[SamplerOutput]:
         if self.is_driver_worker:
             assert seq_group_metadata_list is not None
             num_seq_groups = len(seq_group_metadata_list)
@@ -274,11 +274,13 @@ def execute_model(
 
         # If there is no input, we don't need to execute the model.
         if num_seq_groups == 0:
-            return {}
+            return []
 
         output = self.model_runner.execute_model(seq_group_metadata_list,
                                                  self.cpu_cache)
-        return output
+
+        # CPU worker only supports single-step execution.
+        return [output]
 
     def init_distributed_environment(self) -> None:
         """Initialize the distributed environment."""
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index 2f22f82c045db..142c6c97f5194 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -1,5 +1,5 @@
 """A Neuron worker class."""
-from typing import List, Optional, Tuple
+from typing import List, Tuple
 
 import torch
 import torch.distributed
@@ -73,15 +73,18 @@ def initialize_cache(self, num_gpu_blocks: int,
     def execute_model(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Optional[SamplerOutput]:
+    ) -> List[SamplerOutput]:
         num_seq_groups = len(seq_group_metadata_list)
 
         # If there is no input, we don't need to execute the model.
         if num_seq_groups == 0:
-            return {}
+            return []
 
         output = self.model_runner.execute_model(seq_group_metadata_list)
-        return output
+
+        # Neuron worker only supports single-step output. Wrap the output in a
+        # list to conform to interface.
+        return [output]
 
     def get_cache_block_size_bytes(self) -> int:
         """Determine the size in bytes of a cache block.
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 6a79285f60579..e2b47530d41e4 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -210,7 +210,9 @@ def execute_model(
         blocks_to_swap_in: Optional[Dict[int, int]] = None,
         blocks_to_swap_out: Optional[Dict[int, int]] = None,
         blocks_to_copy: Optional[Dict[int, List[int]]] = None,
-    ) -> Optional[SamplerOutput]:
+        num_lookahead_slots: int = 0,
+    ) -> List[SamplerOutput]:
+
         if self.is_driver_worker:
             assert seq_group_metadata_list is not None
             num_seq_groups = len(seq_group_metadata_list)
@@ -235,11 +237,14 @@ def execute_model(
 
         # If there is no input, we don't need to execute the model.
         if num_seq_groups == 0:
-            return {}
+            return []
 
         output = self.model_runner.execute_model(seq_group_metadata_list,
                                                  self.gpu_cache)
-        return output
+
+        # Worker only supports single-step execution. Wrap the output in a list
+        # to conform to interface.
+        return [output]
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_runner.add_lora(lora_request)
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index d8c9febb11584..a92f5aea76059 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -40,12 +40,13 @@ def initialize_cache(self, num_gpu_blocks: int,
         raise NotImplementedError
 
     @abstractmethod
-    def execute_model(self,
-                      seq_group_metadata_list: List[SequenceGroupMetadata],
-                      blocks_to_swap_in: Dict[int, int],
-                      blocks_to_swap_out: Dict[int, int],
-                      blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
-        """Executes one model step on the given sequences."""
+    def execute_model(
+            self, seq_group_metadata_list: List[SequenceGroupMetadata],
+            blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int,
+                                                                        int],
+            blocks_to_copy: Dict[int, List[int]]) -> List[SamplerOutput]:
+        """Executes at least one model step on the given sequences, unless no
+        sequences are provided."""
         raise NotImplementedError
 
     @abstractmethod

From a84676e385ec14cf360b3ef37b3255f6d8410242 Mon Sep 17 00:00:00 2001
From: Cade Daniel <edacih@gmail.com>
Date: Tue, 16 Apr 2024 17:56:01 -0700
Subject: [PATCH 029/106] [Misc] [CI] Fix CI failure caught after merge (#4126)

---
 vllm/executor/gpu_executor.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index 6e4a765e2ffd5..77c997f97956e 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -68,6 +68,7 @@ def _init_spec_worker(self):
             scheduler_config=self.scheduler_config,
             device_config=self.device_config,
             cache_config=self.cache_config,
+            load_config=self.load_config,
             local_rank=0,
             rank=0,
             distributed_init_method=distributed_init_method,
@@ -82,6 +83,7 @@ def _init_spec_worker(self):
             scheduler_config=self.scheduler_config,
             device_config=self.device_config,
             cache_config=self.cache_config,
+            load_config=self.load_config,
             local_rank=0,
             rank=0,
             distributed_init_method=distributed_init_method,

From f56e1aee3db3258b7a211d607f5dd9a977937cfb Mon Sep 17 00:00:00 2001
From: Cade Daniel <edacih@gmail.com>
Date: Tue, 16 Apr 2024 22:53:26 -0700
Subject: [PATCH 030/106] [CI] Move CPU/AMD tests to after wait (#4123)

---
 .buildkite/test-template.j2 | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index 3ed23c62c005d..0e1acc9777d4b 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -3,13 +3,6 @@
 {% set default_working_dir = "/vllm-workspace/tests" %}
 
 steps:
-  - label: "AMD Test"
-    agents:
-      queue: amd
-    command: bash .buildkite/run-amd-test.sh
-
-  - label: "CPU Test"
-    command: bash .buildkite/run-cpu-test.sh
 
   - label: ":docker: build image"
     commands:
@@ -23,6 +16,14 @@ steps:
           limit: 5
   - wait
 
+  - label: "AMD Test"
+    agents:
+      queue: amd
+    command: bash .buildkite/run-amd-test.sh
+
+  - label: "CPU Test"
+    command: bash .buildkite/run-cpu-test.sh
+
   {% for step in steps %}
   - label: "{{ step.label }}"
     agents:

From 0d9aabe3319d6e041d7fe8488ec7d383a342c5ac Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 17 Apr 2024 01:34:33 -0700
Subject: [PATCH 031/106] [Core] RayWorkerVllm --> WorkerWrapper to reduce
 duplication (#4024)

[Core] replace narrow-usage RayWorkerVllm to general WorkerWrapper to reduce code duplication (#4024)
---
 tests/distributed/test_pynccl.py  |   7 +-
 vllm/engine/ray_utils.py          |  44 ++-------
 vllm/executor/ray_gpu_executor.py | 156 ++++++++++++++++--------------
 vllm/utils.py                     |  16 ++-
 vllm/worker/cpu_worker.py         |   5 +-
 vllm/worker/neuron_worker.py      |   4 +
 vllm/worker/worker.py             |   4 +
 vllm/worker/worker_base.py        |  56 +++++++++++
 8 files changed, 176 insertions(+), 116 deletions(-)

diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index b50eed1c8c722..d58f621d36b86 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -1,18 +1,18 @@
 import multiprocessing
-import os
 
 import pytest
 import torch
 
 from vllm.distributed.device_communicators.pynccl import (NCCLCommunicator,
                                                           ncclGetUniqueId)
+from vllm.utils import update_environment_variables
 
 
 def distributed_run(fn, world_size):
     number_of_processes = world_size
     processes = []
     for i in range(number_of_processes):
-        env = os.environ.copy()
+        env = {}
         env['RANK'] = str(i)
         env['LOCAL_RANK'] = str(i)
         env['WORLD_SIZE'] = str(number_of_processes)
@@ -32,8 +32,7 @@ def update_env(fn):
     # so we need to pass the environment variables as arguments
     # and update the environment variables in the function
     def wrapper(env):
-        import os
-        os.environ.update(env)
+        update_environment_variables(env)
         fn()
 
     return wrapper
diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py
index 04d4ed83976d0..febae42b84549 100644
--- a/vllm/engine/ray_utils.py
+++ b/vllm/engine/ray_utils.py
@@ -1,55 +1,28 @@
 import pickle
-from typing import Callable, List, Optional, Tuple
+from typing import List, Optional, Tuple
 
 from vllm.config import ParallelConfig
 from vllm.logger import init_logger
-from vllm.utils import get_ip, is_hip, set_cuda_visible_devices
-from vllm.worker.worker import Worker
+from vllm.utils import get_ip, is_hip
+from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
 
 try:
     import ray
 
-    class RayWorkerVllm:
+    class RayWorkerWrapper(WorkerWrapperBase):
         """Ray wrapper for vllm.worker.Worker, allowing Worker to be
         lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES."""
 
-        def __init__(self, init_cached_hf_modules=False) -> None:
-            if init_cached_hf_modules:
-                from transformers.dynamic_module_utils import init_hf_modules
-                init_hf_modules()
-            self._worker: Optional[Worker] = None
+        def __init__(self, *args, **kwargs) -> None:
+            super().__init__(*args, **kwargs)
             # Since the compiled DAG runs a main execution
             # in a different thread that calls cuda.set_device.
             # The flag indicates is set_device is called on
             # that thread.
             self.compiled_dag_cuda_device_set = False
 
-        def init_worker(self, worker_init_fn: Callable[[], Worker]):
-            self._worker = worker_init_fn()
-
-        @property
-        def worker(self) -> Worker:
-            assert self._worker is not None
-            return self._worker
-
-        def __getattr__(self, name):
-            return getattr(self.worker, name)
-
-        def execute_method(self, method, *args, **kwargs):
-            try:
-                executor = getattr(self, method)
-                return executor(*args, **kwargs)
-            except Exception as e:
-                # exceptions in ray worker may cause deadlock
-                # see https://github.com/vllm-project/vllm/issues/3455
-                # print the error and inform the user to solve the error
-                msg = (f"Error executing method {method}. "
-                       "This might cause deadlock in distributed execution.")
-                logger.exception(msg)
-                raise e
-
         def get_node_ip(self) -> str:
             return get_ip()
 
@@ -58,9 +31,6 @@ def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
             gpu_ids = ray.get_gpu_ids()
             return node_id, gpu_ids
 
-        def set_cuda_visible_devices(self, device_ids) -> None:
-            set_cuda_visible_devices(device_ids)
-
         def execute_model_compiled_dag_remote(self, ignored):
             """Used only when compiled DAG is enabled."""
             import torch
@@ -77,7 +47,7 @@ def execute_model_compiled_dag_remote(self, ignored):
                    "For distributed inference, please install Ray with "
                    "`pip install ray`.")
     ray = None  # type: ignore
-    RayWorkerVllm = None  # type: ignore
+    RayWorkerWrapper = None  # type: ignore
 
 
 def initialize_ray_cluster(
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 5f859fdc9c078..5a43f1fc28a84 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -1,17 +1,16 @@
 import asyncio
-import copy
 import os
 import pickle
 from collections import defaultdict
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple
 
-from vllm.engine.ray_utils import RayWorkerVllm, ray
+from vllm.engine.ray_utils import RayWorkerWrapper, ray
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
-                        make_async, set_cuda_visible_devices)
+                        make_async)
 
 if ray is not None:
     from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -74,9 +73,9 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
 
         # The driver dummy worker does not actually use any resources.
         # It holds the resource for the driver worker.
-        self.driver_dummy_worker: RayWorkerVllm = None
+        self.driver_dummy_worker: RayWorkerWrapper = None
         # The remaining workers are the actual ray actors.
-        self.workers: List[RayWorkerVllm] = []
+        self.workers: List[RayWorkerWrapper] = []
 
         if self.parallel_config.ray_workers_use_nsight:
             ray_remote_kwargs = self._configure_ray_workers_use_nsight(
@@ -97,13 +96,20 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
                 num_gpus=num_gpus,
                 scheduling_strategy=scheduling_strategy,
                 **ray_remote_kwargs,
-            )(RayWorkerVllm).remote(self.model_config.trust_remote_code)
+            )(RayWorkerWrapper).remote(
+                worker_module_name="vllm.worker.worker",
+                worker_class_name="Worker",
+            )
 
             worker_ip = ray.get(worker.get_node_ip.remote())
             if worker_ip == driver_ip and self.driver_dummy_worker is None:
                 # If the worker is on the same node as the driver, we use it
                 # as the resource holder for the driver process.
                 self.driver_dummy_worker = worker
+                self.driver_worker = RayWorkerWrapper(
+                    worker_module_name="vllm.worker.worker",
+                    worker_class_name="Worker",
+                )
             else:
                 # Else, added to the list of workers.
                 self.workers.append(worker)
@@ -115,82 +121,56 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
                 "GPU node.")
 
         # Get the set of GPU IDs used on each node.
-        driver_node_id, driver_gpu_ids = ray.get(
-            self.driver_dummy_worker.get_node_and_gpu_ids.remote())
-        worker_node_and_gpu_ids = ray.get(
-            [worker.get_node_and_gpu_ids.remote() for worker in self.workers])
+        worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
+                                                    use_dummy_driver=True)
 
         node_workers = defaultdict(list)
         node_gpus = defaultdict(list)
 
-        node_workers[driver_node_id].append(0)
-        node_gpus[driver_node_id].extend(driver_gpu_ids)
-        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids,
-                                               start=1):
+        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
             node_workers[node_id].append(i)
             node_gpus[node_id].extend(gpu_ids)
         for node_id, gpu_ids in node_gpus.items():
             node_gpus[node_id] = sorted(gpu_ids)
 
         # Set CUDA_VISIBLE_DEVICES for the driver and workers.
-        set_cuda_visible_devices(node_gpus[driver_node_id])
-        for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids):
-            worker.set_cuda_visible_devices.remote(node_gpus[node_id])
+        all_args_to_update_environment_variables = []
+        for (node_id, _) in worker_node_and_gpu_ids:
+            all_args_to_update_environment_variables.append([{
+                "CUDA_VISIBLE_DEVICES":
+                ",".join(map(str, node_gpus[node_id]))
+            }])
+        self._run_workers("update_environment_variables",
+                          all_args=all_args_to_update_environment_variables)
 
         distributed_init_method = get_distributed_init_method(
             driver_ip, get_open_port())
 
-        # Lazy import the Worker to avoid importing torch.cuda/xformers
-        # before CUDA_VISIBLE_DEVICES is set in the Worker
-        from vllm.worker.worker import Worker
-
-        model_config = copy.deepcopy(self.model_config)
-        parallel_config = copy.deepcopy(self.parallel_config)
-        scheduler_config = copy.deepcopy(self.scheduler_config)
-        load_config = copy.deepcopy(self.load_config)
-        device_config = copy.deepcopy(self.device_config)
-        lora_config = copy.deepcopy(self.lora_config)
-        cache_config = copy.deepcopy(self.cache_config)
-        vision_language_config = copy.deepcopy(self.vision_language_config)
-
-        # Initialize the actual workers with the Worker class.
-        for rank, (worker, (node_id, _)) in enumerate(
-                zip(self.workers, worker_node_and_gpu_ids),
-                start=1,
-        ):
+        def collect_arg_helper_func(**kwargs):
+            # avoid writing `{"name": value}` manually
+            return kwargs
+
+        init_worker_all_kwargs = []
+
+        # Initialize the actual workers inside worker wrapper.
+        for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids, ):
             local_rank = node_workers[node_id].index(rank)
-            worker.init_worker.remote(
-                lambda rank=rank, local_rank=local_rank: Worker(
-                    model_config=model_config,
-                    parallel_config=parallel_config,
-                    scheduler_config=scheduler_config,
-                    device_config=device_config,
-                    cache_config=cache_config,
-                    load_config=load_config,
+            init_worker_all_kwargs.append(
+                collect_arg_helper_func(
+                    model_config=self.model_config,
+                    parallel_config=self.parallel_config,
+                    scheduler_config=self.scheduler_config,
+                    device_config=self.device_config,
+                    cache_config=self.cache_config,
+                    load_config=self.load_config,
                     local_rank=local_rank,
                     rank=rank,
                     distributed_init_method=distributed_init_method,
-                    lora_config=lora_config,
-                    vision_language_config=vision_language_config,
+                    lora_config=self.lora_config,
+                    vision_language_config=self.vision_language_config,
+                    is_driver_worker=rank == 0,
                 ))
-
-        # Initialize the driver worker with the Worker class.
-        driver_rank = 0
-        driver_local_rank = node_workers[driver_node_id].index(driver_rank)
-        self.driver_worker = Worker(
-            model_config=self.model_config,
-            parallel_config=self.parallel_config,
-            scheduler_config=self.scheduler_config,
-            device_config=self.device_config,
-            cache_config=self.cache_config,
-            local_rank=driver_local_rank,
-            rank=driver_rank,
-            distributed_init_method=distributed_init_method,
-            lora_config=self.lora_config,
-            vision_language_config=self.vision_language_config,
-            load_config=self.load_config,
-            is_driver_worker=True,
-        )
+        self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
 
         self._run_workers("init_device")
         self._run_workers(
@@ -279,13 +259,35 @@ def _run_workers(
         self,
         method: str,
         *args,
-        driver_args: Optional[Tuple[Any, ...]] = None,
+        driver_args: Optional[Tuple[Any]] = None,
         driver_kwargs: Optional[Dict[str, Any]] = None,
+        all_args: Optional[List[List[Any]]] = None,
+        all_kwargs: Optional[List[Dict[str, Any]]] = None,
+        use_dummy_driver: bool = False,
         max_concurrent_workers: Optional[int] = None,
         use_ray_compiled_dag: bool = False,
         **kwargs,
     ) -> Any:
-        """Runs the given method on all workers."""
+        """Runs the given method on all workers.
+        all_args and all_kwargs are used to pass heterogeneous arguments,
+        i.e. different arguments for each worker.
+        """
+        if driver_args is None:
+            driver_args = args
+        if driver_kwargs is None:
+            driver_kwargs = kwargs
+
+        # for mypy type checking
+        assert driver_args is not None
+        assert driver_kwargs is not None
+        if all_args is None:
+            all_args = [driver_args] + [args] * len(self.workers)
+        if all_kwargs is None:
+            all_kwargs = [driver_kwargs] + [kwargs] * len(self.workers)
+
+        # for mypy type checking
+        assert all_args is not None
+        assert all_kwargs is not None
 
         if max_concurrent_workers:
             raise NotImplementedError(
@@ -299,8 +301,10 @@ def _run_workers(
         else:
             # Start the ray workers first.
             ray_worker_outputs = [
-                worker.execute_method.remote(method, *args, **kwargs)
-                for worker in self.workers
+                worker.execute_method.remote(method, *worker_args,
+                                             **worker_kwargs)
+                for (worker, worker_args, worker_kwargs
+                     ) in zip(self.workers, all_args[1:], all_kwargs[1:])
             ]
 
         if driver_args is None:
@@ -309,9 +313,13 @@ def _run_workers(
             driver_kwargs = kwargs
 
         # Start the driver worker after all the ray workers.
-        driver_worker_output = getattr(self.driver_worker,
-                                       method)(*driver_args, **driver_kwargs)
-
+        if not use_dummy_driver:
+            driver_worker_output = self.driver_worker.execute_method(
+                method, *all_args[0], **all_kwargs[0])
+        else:
+            driver_worker_output = ray.get(
+                self.driver_dummy_worker.execute_method.remote(
+                    method, *all_args[0], **all_kwargs[0]))
         # Get the results of the ray workers.
         if self.workers:
             if use_ray_compiled_dag:
@@ -386,8 +394,12 @@ async def _run_workers_async(
             driver_kwargs = kwargs
 
         # Run the driver worker asynchronously.
-        driver_executor = make_async(getattr(self.driver_worker, method))
-        coros.append(driver_executor(*driver_args, **driver_kwargs))
+        def helper():
+            return self.driver_worker.execute_method(method, *driver_args,
+                                                     **driver_kwargs)
+
+        driver_executor = make_async(helper)
+        coros.append(driver_executor())
 
         # Run the ray workers asynchronously.
         for worker in self.workers:
diff --git a/vllm/utils.py b/vllm/utils.py
index 9e0ec6eab9383..dabd96e7489d8 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -271,8 +271,12 @@ def get_open_port() -> int:
             return s.getsockname()[1]
 
 
-def set_cuda_visible_devices(device_ids: List[int]) -> None:
-    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, device_ids))
+def update_environment_variables(envs: Dict[str, str]):
+    for k, v in envs.items():
+        if k in os.environ:
+            logger.warning(f"Overwriting environment variable {k} "
+                           f"from '{os.environ[k]}' to '{v}'")
+        os.environ[k] = v
 
 
 def chunk_list(lst, chunk_size):
@@ -505,3 +509,11 @@ def merge_dicts(dict1: Dict[Any, List[Any]],
         merged_dict[key].extend(value)
 
     return dict(merged_dict)
+
+
+def init_cached_hf_modules():
+    """
+    Lazy initialization of the Hugging Face modules.
+    """
+    from transformers.dynamic_module_utils import init_hf_modules
+    init_hf_modules()
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index afc4a1e1f4630..8468ace5a2fdc 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -138,7 +138,10 @@ def __init__(
         self.is_driver_worker = is_driver_worker
         if self.is_driver_worker:
             assert self.rank == 0, "The driver worker must have rank 0."
-
+        if self.model_config.trust_remote_code:
+            # note: lazy import to avoid importing torch before initializing
+            from vllm.utils import init_cached_hf_modules
+            init_cached_hf_modules()
         self.model_runner = CPUModelRunner(model_config,
                                            parallel_config,
                                            scheduler_config,
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index 142c6c97f5194..d0e6aaed180e6 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -29,6 +29,10 @@ def __init__(
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.cache_config = cache_config
+        if self.model_config.trust_remote_code:
+            # note: lazy import to avoid importing torch before initializing
+            from vllm.utils import init_cached_hf_modules
+            init_cached_hf_modules()
 
         self.model_runner = NeuronModelRunner(model_config, parallel_config,
                                               scheduler_config, device_config)
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index e2b47530d41e4..b021866965401 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -60,6 +60,10 @@ def __init__(
         if self.is_driver_worker:
             assert self.rank == 0, "The driver worker must have rank 0."
 
+        if self.model_config.trust_remote_code:
+            # note: lazy import to avoid importing torch before initializing
+            from vllm.utils import init_cached_hf_modules
+            init_cached_hf_modules()
         self.vision_language_config = vision_language_config
         if self.vision_language_config:
             assert not self.lora_config, (
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index a92f5aea76059..309aa6256acea 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -1,8 +1,14 @@
+import importlib
+import os
 from abc import ABC, abstractmethod
 from typing import Dict, List, Tuple
 
+from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.utils import update_environment_variables
+
+logger = init_logger(__name__)
 
 
 class WorkerBase(ABC):
@@ -82,3 +88,53 @@ def remove_lora(self, lora_id: int) -> bool:
 
     def list_loras(self) -> List[int]:
         raise ValueError(f"{type(self)} does not support LoRA")
+
+
+class WorkerWrapperBase:
+    """
+    The whole point of this class is to lazily initialize the worker.
+    We first instantiate the WorkerWrapper, which remembers the worker module
+    and class name. Then, when we call `update_environment_variables`, and the
+    real initialization happens in `init_worker`.
+    """
+
+    def __init__(self,
+                 worker_module_name=None,
+                 worker_class_name=None) -> None:
+        self.worker_module_name = worker_module_name
+        self.worker_class_name = worker_class_name
+        self.worker = None
+
+    def update_environment_variables(self, envs: Dict[str, str]) -> None:
+        key = 'CUDA_VISIBLE_DEVICES'
+        if key in envs and key in os.environ:
+            # overwriting CUDA_VISIBLE_DEVICES is desired behavior
+            # suppress the warning in `update_environment_variables`
+            del os.environ[key]
+        update_environment_variables(envs)
+
+    def init_worker(self, *args, **kwargs):
+        """
+        Actual initialization of the worker class.
+        Arguments are passed to the worker class constructor.
+        """
+        mod = importlib.import_module(self.worker_module_name)
+        worker_class = getattr(mod, self.worker_class_name)
+        self.worker = worker_class(*args, **kwargs)
+
+    def execute_method(self, method, *args, **kwargs):
+        try:
+            if hasattr(self, method):
+                executor = getattr(self, method)
+            else:
+                executor = getattr(self.worker, method)
+            return executor(*args, **kwargs)
+        except Exception as e:
+            # if the driver worker also execute methods,
+            # exceptions in the rest worker may cause deadlock in rpc like ray
+            # see https://github.com/vllm-project/vllm/issues/3455
+            # print the error and inform the user to solve the error
+            msg = (f"Error executing method {method}. "
+                   "This might cause deadlock in distributed execution.")
+            logger.exception(msg)
+            raise e

From ffd9ca857a299a07c020b5e11a4480116aa29717 Mon Sep 17 00:00:00 2001
From: Elinx <xizzuli@163.com>
Date: Wed, 17 Apr 2024 19:07:23 +0800
Subject: [PATCH 032/106] [Bugfix] fix output parsing error for trtllm backend
 (#4137)

Co-authored-by: Roger Wang <ywang@roblox.com>
---
 benchmarks/backend_request_func.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 51c395e5698b5..34c22d2d9879a 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -139,6 +139,7 @@ async def async_request_trt_llm(
                                               "data:")
 
                         data = json.loads(chunk)
+                        output.generated_text += data["text_output"]
                         timestamp = time.perf_counter()
                         # First token
                         if ttft == 0.0:
@@ -153,7 +154,6 @@ async def async_request_trt_llm(
                         most_recent_timestamp = timestamp
 
                     output.latency = most_recent_timestamp - st
-                    output.generated_text = json.loads(data)["text_output"]
                     output.success = True
 
                 else:

From 302870bb7f3a3a67808e80011ea2478e7baa967f Mon Sep 17 00:00:00 2001
From: Shoichi Uchinami <s.uchinami@gmail.com>
Date: Thu, 18 Apr 2024 02:02:45 +0900
Subject: [PATCH 033/106] [Kernel] Add punica dimension for Swallow-MS-7B LoRA
 (#4134)

---
 csrc/punica/bgmv/bgmv_config.h | 1 +
 tests/lora/test_punica.py      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h
index d2906914f927e..fec484d693055 100644
--- a/csrc/punica/bgmv/bgmv_config.h
+++ b/csrc/punica/bgmv/bgmv_config.h
@@ -60,6 +60,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 32768) \
     f(in_T, out_T, W_T, narrow, 33024) \
     f(in_T, out_T, W_T, narrow, 36864) \
+    f(in_T, out_T, W_T, narrow, 43264) \
     f(in_T, out_T, W_T, narrow, 49152) \
     f(in_T, out_T, W_T, narrow, 64000) \
     f(in_T, out_T, W_T, narrow, 64256) \
diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py
index 8b174f01d87d4..f3b9bd5912967 100644
--- a/tests/lora/test_punica.py
+++ b/tests/lora/test_punica.py
@@ -82,6 +82,7 @@ def _lora_ref_impl(
     32768,
     33024,
     36864,
+    43264,
     49152,
     64000,
     64256,

From e69ff115219e6940ce098b9d40d9d97338ac8323 Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Thu, 18 Apr 2024 09:28:43 +0900
Subject: [PATCH 034/106] [Typing] Mypy typing part 2 (#4043)

Co-authored-by: SangBin Cho <sangcho@sangcho-LT93GQWG9C.local>
---
 .github/workflows/mypy.yaml                   |  8 +-
 format.sh                                     |  8 +-
 vllm/engine/async_llm_engine.py               | 44 +++++----
 vllm/lora/worker_manager.py                   |  4 +-
 .../guided_decoding/outlines_decoding.py      |  4 +-
 .../outlines_logits_processors.py             |  6 +-
 vllm/model_executor/model_loader/neuron.py    | 16 ++--
 .../model_executor/model_loader/tensorizer.py |  1 +
 vllm/model_executor/sampling_metadata.py      |  4 +
 vllm/spec_decode/batch_expansion.py           |  6 +-
 vllm/spec_decode/interfaces.py                |  4 +-
 vllm/spec_decode/metrics.py                   |  1 +
 vllm/spec_decode/multi_step_worker.py         | 21 ++--
 vllm/spec_decode/spec_decode_worker.py        |  6 +-
 vllm/worker/cpu_model_runner.py               | 25 +++--
 vllm/worker/cpu_worker.py                     | 11 ++-
 vllm/worker/model_runner.py                   | 96 +++++++++++--------
 vllm/worker/neuron_model_runner.py            | 22 +++--
 vllm/worker/worker.py                         | 11 ++-
 vllm/worker/worker_base.py                    |  8 +-
 20 files changed, 180 insertions(+), 126 deletions(-)

diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index 6db0bb7645ecd..477ce9bc9ce85 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -41,10 +41,10 @@ jobs:
         mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
         mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml
 
+        mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/spec_decode/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
         # TODO(sang): Follow up
-        # mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
-        # mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
-        # mypy vllm/spec_decoding/*.py --follow-imports=skip --config-file pyproject.toml
-        # mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
         # mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml
 
diff --git a/format.sh b/format.sh
index 1c195b899c742..84ee88b5b4c8a 100755
--- a/format.sh
+++ b/format.sh
@@ -104,10 +104,10 @@ mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
 mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml
 
 # TODO(sang): Follow up
-# mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
-# mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
-# mypy vllm/spec_decoding/*.py --follow-imports=skip --config-file pyproject.toml
-# mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/spec_decode/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
 # mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml
 
 
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 27192449bf15a..c3020d2b38db0 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -2,8 +2,8 @@
 import os
 import time
 from functools import partial
-from typing import (AsyncIterator, Callable, Dict, Iterable, List, Optional,
-                    Set, Tuple, Type, Union)
+from typing import (Any, AsyncIterator, Callable, Dict, Iterable, List,
+                    Optional, Set, Tuple, Type, Union)
 
 from transformers import PreTrainedTokenizer
 
@@ -52,7 +52,7 @@ class AsyncStream:
 
     def __init__(self, request_id: str) -> None:
         self.request_id = request_id
-        self._queue = asyncio.Queue()
+        self._queue: asyncio.Queue = asyncio.Queue()
         self._finished = False
 
     def put(self, item: Union[RequestOutput, Exception]) -> None:
@@ -312,15 +312,17 @@ def __init__(self,
         self.max_log_len = max_log_len
         self.engine = self._init_engine(*args, **kwargs)
 
-        self.background_loop = None
+        self.background_loop: Optional[asyncio.Future] = None
         # We need to keep a reference to unshielded
         # task as well to prevent it from being garbage
         # collected
-        self._background_loop_unshielded = None
+        self._background_loop_unshielded: Optional[asyncio.Task[Any]] = None
         self.start_engine_loop = start_engine_loop
-        self._request_tracker: Optional[RequestTracker] = None
         self._errored_with: Optional[BaseException] = None
 
+        # Lazy initialized fields
+        self._request_tracker: RequestTracker
+
     @classmethod
     def from_engine_args(
         cls,
@@ -361,11 +363,13 @@ def from_engine_args(
     @property
     def is_running(self) -> bool:
         return (self.background_loop is not None
+                and self._background_loop_unshielded is not None
                 and not self._background_loop_unshielded.done())
 
     @property
     def is_stopped(self) -> bool:
-        return self.errored or (self.background_loop is not None
+        return self.errored or (self.background_loop is not None and
+                                self._background_loop_unshielded is not None
                                 and self._background_loop_unshielded.done())
 
     @property
@@ -381,7 +385,7 @@ def _error_callback(self, exc: Exception) -> None:
 
     async def get_tokenizer(self) -> "PreTrainedTokenizer":
         if self.engine_use_ray:
-            return await self.engine.get_tokenizer.remote()
+            return await self.engine.get_tokenizer.remote()  # type: ignore
         else:
             return self.engine.get_tokenizer()
 
@@ -434,7 +438,8 @@ async def engine_step(self) -> bool:
             # TODO: Maybe add add_request_batch to reduce Ray overhead
             try:
                 if self.engine_use_ray:
-                    await self.engine.add_request.remote(**new_request)
+                    await self.engine.add_request.remote(  # type: ignore
+                        **new_request)
                 else:
                     await self.engine.add_request_async(**new_request)
             except ValueError as e:
@@ -449,7 +454,7 @@ async def engine_step(self) -> bool:
             await self._engine_abort(finished_requests)
 
         if self.engine_use_ray:
-            request_outputs = await self.engine.step.remote()
+            request_outputs = await self.engine.step.remote()  # type: ignore
         else:
             request_outputs = await self.engine.step_async()
 
@@ -462,7 +467,7 @@ async def engine_step(self) -> bool:
 
     async def _engine_abort(self, request_ids: Iterable[str]):
         if self.engine_use_ray:
-            await self.engine.abort_request.remote(request_ids)
+            await self.engine.abort_request.remote(request_ids)  # type: ignore
         else:
             self.engine.abort_request(request_ids)
 
@@ -525,11 +530,12 @@ async def add_request(
             arrival_time = time.time()
 
         if self.engine_use_ray:
-            prompt_token_ids = await self.engine.encode_request_async.remote(
-                request_id=request_id,
-                prompt=prompt,
-                prompt_token_ids=prompt_token_ids,
-                lora_request=lora_request)
+            prompt_token_ids = await (
+                self.engine.encode_request_async.remote(  # type: ignore
+                    request_id=request_id,
+                    prompt=prompt,
+                    prompt_token_ids=prompt_token_ids,
+                    lora_request=lora_request))
         else:
             prompt_token_ids = await self.engine.encode_request_async(
                 request_id=request_id,
@@ -676,13 +682,13 @@ def _abort(self, request_id: str) -> None:
     async def get_model_config(self) -> ModelConfig:
         """Get the model configuration of the vLLM engine."""
         if self.engine_use_ray:
-            return await self.engine.get_model_config.remote()
+            return await self.engine.get_model_config.remote()  # type: ignore
         else:
             return self.engine.get_model_config()
 
     async def do_log_stats(self) -> None:
         if self.engine_use_ray:
-            await self.engine.do_log_stats.remote()
+            await self.engine.do_log_stats.remote()  # type: ignore
         else:
             self.engine.do_log_stats()
 
@@ -695,7 +701,7 @@ async def check_health(self) -> None:
 
         if self.engine_use_ray:
             try:
-                await self.engine.check_health.remote()
+                await self.engine.check_health.remote()  # type: ignore
             except ray.exceptions.RayActorError as e:
                 raise RuntimeError("Engine is dead.") from e
         else:
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index a0868defbd3ca..5356b79537b05 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -107,12 +107,12 @@ def create_lora_manager(
         self._lora_manager: LoRAModelManager = lora_manager
         return lora_manager.model
 
-    def set_active_loras(self, lora_requests: List[LoRARequest],
+    def set_active_loras(self, lora_requests: Set[LoRARequest],
                          lora_mapping: LoRAMapping) -> None:
         self._apply_loras(lora_requests)
         self._lora_manager.set_lora_mapping(lora_mapping)
 
-    def _apply_loras(self, lora_requests: List[LoRARequest]) -> None:
+    def _apply_loras(self, lora_requests: Set[LoRARequest]) -> None:
         loras_that_exist = self.list_loras()
         loras_map = {
             lora_request.lora_int_id: lora_request
diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py
index bd4564a36e1ed..53efebb604048 100644
--- a/vllm/model_executor/guided_decoding/outlines_decoding.py
+++ b/vllm/model_executor/guided_decoding/outlines_decoding.py
@@ -55,7 +55,7 @@ class GuidedDecodingMode(Enum):
 
 async def get_outlines_guided_decoding_logits_processor(
         request: Union[CompletionRequest, ChatCompletionRequest],
-        tokenizer) -> Union[JSONLogitsProcessor, RegexLogitsProcessor]:
+        tokenizer) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, None]:
     """
     Given an OpenAI-compatible request, check for guided decoding parameters
     and get the necessary logits processor for the given guide.
@@ -84,7 +84,7 @@ async def get_outlines_guided_decoding_logits_processor(
 
 def _get_guide_and_mode(
     request: Union[CompletionRequest, ChatCompletionRequest]
-) -> Tuple[str, GuidedDecodingMode]:
+) -> Union[Tuple[str, GuidedDecodingMode], Tuple[None, None]]:
 
     if request.guided_json:
         json = request.guided_json
diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index 28041695546dc..95a67b612f08b 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -21,7 +21,7 @@
 from typing import Callable, DefaultDict, Dict, List, Optional, Union
 
 import torch
-from outlines.fsm.fsm import CFGFSM, RegexFSM
+from outlines.fsm.fsm import CFGFSM, FSM, RegexFSM
 from outlines.fsm.json_schema import build_regex_from_schema
 from pydantic import BaseModel
 from transformers import PreTrainedTokenizerBase
@@ -29,6 +29,10 @@
 
 class BaseLogitsProcessor:
 
+    def __init__(self):
+        # Child class should use initialize in their init.
+        self.fsm: FSM
+
     def init_state(self):
         """Initialize the FSM states."""
         self.fsm_state: DefaultDict[int, int] = defaultdict(int)
diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py
index 43d17ad373b87..07e23aca6cc5f 100644
--- a/vllm/model_executor/model_loader/neuron.py
+++ b/vllm/model_executor/model_loader/neuron.py
@@ -1,7 +1,7 @@
 """Utilities for selecting and loading neuron models."""
 import importlib
 import os
-from typing import Optional, Type
+from typing import Dict, Optional, Tuple
 
 import torch
 import torch.nn as nn
@@ -27,7 +27,7 @@
 }
 
 # Models supported by Neuron.
-_NEURON_SUPPORTED_MODELS = {
+_NEURON_SUPPORTED_MODELS: Dict[str, Tuple[str, str, str]] = {
     "LlamaForCausalLM": ("transformers_neuronx.llama.model",
                          "LlamaForSampling", "LlamaForCausalLM"),
     "MistralForCausalLM": ("transformers_neuronx.mistral.model",
@@ -43,11 +43,13 @@ def __init__(
     ) -> None:
         super().__init__()
         self.config = config
-        self.model = None
         self.logits_processor = LogitsProcessor(config.vocab_size,
                                                 logits_as_input=True)
         self.sampler = Sampler()
 
+        # Lazy initialized
+        self.model: nn.Module
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -74,17 +76,17 @@ def sample(
 
     def load_weights(self, model_name_or_path: str, **kwargs):
         arch = _get_model_architecture(self.config)
-        neuronx_module_path, neuronx_model_cls, hf_model_cls = (
+        neuronx_module_path, neuronx_model_cls_name, hf_model_cls_name = (
             _NEURON_SUPPORTED_MODELS[arch])
         neuronx_module = importlib.import_module(neuronx_module_path)
-        neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls)
+        neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name)
 
         split_model_dir = f"{model_name_or_path}-split"
         if os.path.isdir(os.path.join(model_name_or_path,
                                       "pytorch_model.bin")):
             split_model_dir = model_name_or_path
         elif not os.path.exists(f"{model_name_or_path}-split"):
-            hf_model_cls = getattr(transformers, hf_model_cls)
+            hf_model_cls = getattr(transformers, hf_model_cls_name)
             from transformers_neuronx.module import save_pretrained_split
 
             hf_model = hf_model_cls.from_pretrained(model_name_or_path,
@@ -96,7 +98,7 @@ def load_weights(self, model_name_or_path: str, **kwargs):
         self.model.to_neuron()
 
 
-def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
+def _get_model_architecture(config: PretrainedConfig) -> str:
     architectures = getattr(config, "architectures", [])
     for arch in architectures:
         if arch in _NEURON_SUPPORTED_MODELS:
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index ad554844384eb..16be0ecf9ce07 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -167,6 +167,7 @@ def __post_init__(self):
                 decryption_params = DecryptionParams.from_key(key)
                 self.deserializer_params['encryption'] = decryption_params
 
+    @staticmethod
     def add_cli_args(
             parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         """Tensorizer CLI arguments"""
diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index 534cb75c2fd2f..31032c4cead20 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -113,6 +113,8 @@ def from_sampling_metadata(
                              get_num_triton_sampler_splits(vocab_size))
 
         sample_indices_start_idx = 0
+        assert sampling_metadata.seq_groups is not None
+        assert sampling_metadata.seq_data is not None
         for i, seq_group in enumerate(sampling_metadata.seq_groups):
             seq_ids, sampling_params = seq_group
             temperature = sampling_params.temperature
@@ -147,6 +149,7 @@ def from_sampling_metadata(
                     and sampling_params.prompt_logprobs is not None):
                 # For tokens in the prompt that we only need to get
                 # their logprobs
+                assert sampling_metadata.prompt_lens is not None
                 prompt_len = sampling_metadata.prompt_lens[i]
                 temperatures += [temperature] * (prompt_len - 1)
                 top_ps += [top_p] * (prompt_len - 1)
@@ -172,6 +175,7 @@ def from_sampling_metadata(
             is_prompt = i < sampling_metadata.num_prompts
             if is_prompt:
                 prompt_best_of.append(sampling_params.best_of)
+                assert sampling_metadata.prompt_lens is not None
                 prompt_len = sampling_metadata.prompt_lens[i]
 
                 if sampling_params.prompt_logprobs is not None:
diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py
index 88af1dd360155..bbc5b1778854f 100644
--- a/vllm/spec_decode/batch_expansion.py
+++ b/vllm/spec_decode/batch_expansion.py
@@ -106,7 +106,7 @@ def score_proposals(
     def _expand_batch(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-        proposal_token_ids_list: List[TokenId],
+        proposal_token_ids_list: List[List[TokenId]],
         proposal_lens_list: List[int],
     ) -> Tuple[List[int], List[int], List[SequenceGroupMetadata], int]:
         """Given the input sequences and potentially multiple corresponding
@@ -218,7 +218,7 @@ def _create_scoring_model_input(
     def _create_target_seq_group_metadata(
         self,
         input_seq_group_metadata: SequenceGroupMetadata,
-        proposal_token_ids: List[TokenId],  # shape: [batch_size, k]
+        proposal_token_ids: List[List[TokenId]],  # shape: [batch_size, k]
         batch_index: int,
         target_seq_ids_iter: Iterator[TargetSeqId],
     ) -> List[SequenceGroupMetadata]:
@@ -360,7 +360,7 @@ def _get_token_ids_to_score(
                 [0, 1, 2]
                 [0, 1, 2, 3]
         """
-        empty_token_ids = []
+        empty_token_ids: List[TokenId] = []
 
         token_ids_to_score = [empty_token_ids]
         token_ids_to_score.extend([
diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py
index 2a72974d01bdc..f0715120192e5 100644
--- a/vllm/spec_decode/interfaces.py
+++ b/vllm/spec_decode/interfaces.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional
 
 import torch
 
@@ -73,5 +73,5 @@ def score_proposals(
         blocks_to_copy: Optional[Dict[int, List[int]]],
         k: int,
         proposals: SpeculativeProposals,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> SpeculativeScores:
         raise NotImplementedError
diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py
index 5df8fc4316d48..d1e72b6640548 100644
--- a/vllm/spec_decode/metrics.py
+++ b/vllm/spec_decode/metrics.py
@@ -112,6 +112,7 @@ def _copy_rejsample_metrics_async(self) -> torch.cuda.Event:
 
         Returns a CUDA event recording when the copy is complete.
         """
+        assert self._copy_stream is not None
         self._copy_stream.wait_stream(torch.cuda.current_stream())
 
         with torch.cuda.stream(self._copy_stream):
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index ce63c329a40aa..8b722476853fa 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -26,7 +26,8 @@ class MultiStepWorker(Worker):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-        self._proposer: Optional[DraftModelTop1Proposer] = None
+        # Lazy initialization list.
+        self._proposer: DraftModelTop1Proposer
 
     def init_device(self):
         super().init_device()
@@ -338,10 +339,10 @@ def _merge_outputs(
                                          self._vocab_size,
                                          dtype=torch.float32,
                                          device=self._device)
-            proposal_lens = torch.zeros(len(proposal_lens),
-                                        dtype=torch.long,
-                                        device=self._device)
-            return proposal_tokens, proposal_probs, proposal_lens
+            proposal_lens_tensor = torch.zeros(len(proposal_lens),
+                                               dtype=torch.long,
+                                               device=self._device)
+            return proposal_tokens, proposal_probs, proposal_lens_tensor
 
         sampler_output = maybe_sampler_output
 
@@ -376,9 +377,9 @@ def _merge_outputs(
         proposal_tokens, proposal_probs = (entire_proposal_tokens,
                                            entire_proposal_probs)
 
-        proposal_lens = torch.zeros(batch_size,
-                                    dtype=torch.long,
-                                    device=self._device)
-        proposal_lens[nonzero_proposal_len_indices] = max_proposal_len
+        proposal_lens_tensor = torch.zeros(batch_size,
+                                           dtype=torch.long,
+                                           device=self._device)
+        proposal_lens_tensor[nonzero_proposal_len_indices] = max_proposal_len
 
-        return proposal_tokens, proposal_probs, proposal_lens
+        return proposal_tokens, proposal_probs, proposal_lens_tensor
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index be3af7be93864..68a2a774ef4b7 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -89,7 +89,8 @@ def __init__(
         self.probs_dtype = self.rejection_sampler.probs_dtype
         self.token_id_dtype = self.rejection_sampler.token_id_dtype
 
-        self.scorer: SpeculativeScorer = None
+        # Lazy initiazliation.
+        self.scorer: SpeculativeScorer
 
     def init_device(self) -> None:
         """Initialize both scorer and proposer models.
@@ -233,6 +234,9 @@ def _run_speculative_decoding_step(
 
         logger.info("get spec proposals")
         # Generate proposals using draft worker.
+        assert blocks_to_swap_in is not None
+        assert blocks_to_swap_out is not None
+        assert blocks_to_copy is not None
         proposals = self.proposer_worker.get_spec_proposals(
             seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out,
             blocks_to_copy, k)
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index d378e3a90e1e7..7377c8931cefa 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -1,6 +1,7 @@
 from typing import Dict, List, Optional, Tuple
 
 import torch
+from torch import nn
 
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig,
@@ -48,14 +49,15 @@ def __init__(
                               if device_config is not None else DeviceConfig())
         self.device = self.device_config.device
 
-        self.model = None
-        self.block_size = None  # Set after initial profiling.
-
         self.kv_cache_dtype = kv_cache_dtype
 
         self.attn_backend = get_attn_backend(
             self.model_config.dtype if model_config is not None else None)
 
+        # Lazy initialization.
+        self.model: nn.Module  # Set after init_Model
+        self.block_size: int  # Set after initial profiling.
+
     def load_model(self) -> None:
         self.model = get_model(model_config=self.model_config,
                                load_config=self.load_config,
@@ -245,7 +247,11 @@ def _prepare_sample(
         selected_token_indices: List[int] = []
         generators: List[torch.Generator] = []
         selected_token_start_idx = 0
-        categorized_sample_indices = {t: [] for t in SamplingType}
+        categorized_sample_indices: Dict[SamplingType,
+                                         List[Tuple[int, int]]] = {
+                                             t: []
+                                             for t in SamplingType
+                                         }
         categorized_sample_indices_start_idx = 0
         categorized_sampled_token_indices_start_idx = 0
 
@@ -262,10 +268,9 @@ def _prepare_sample(
                     categorized_sample_indices_start_idx += subquery_len - 1
 
                 categorized_sample_indices[
-                    sampling_params.sampling_type].append([
-                        categorized_sample_indices_start_idx,
-                        categorized_sampled_token_indices_start_idx
-                    ])
+                    sampling_params.sampling_type].append(
+                        (categorized_sample_indices_start_idx,
+                         categorized_sampled_token_indices_start_idx))
                 categorized_sample_indices_start_idx += 1
                 categorized_sampled_token_indices_start_idx += 1
 
@@ -328,7 +333,7 @@ def _prepare_sample(
 
     def prepare_input_tensors(
         self,
-        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+        seq_group_metadata_list: List[SequenceGroupMetadata],
     ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata,
                SamplingMetadata]:
         if self.is_driver_worker:
@@ -381,7 +386,7 @@ def prepare_input_tensors(
     @torch.inference_mode()
     def execute_model(
         self,
-        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+        seq_group_metadata_list: List[SequenceGroupMetadata],
         kv_caches: List[torch.Tensor],
     ) -> Optional[SamplerOutput]:
         (input_tokens, input_positions, attn_metadata, sampling_metadata
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 8468ace5a2fdc..3652830b7d519 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -1,5 +1,5 @@
 """A CPU worker class."""
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 import torch
 import torch.distributed
@@ -152,8 +152,8 @@ def __init__(
                                            is_driver_worker=is_driver_worker)
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
-        self.cache_engine = None
-        self.cpu_cache = None
+        self.cache_engine: CPUCacheEngine
+        self.cpu_cache: List[torch.Tensor]
 
     def init_device(self) -> None:
         self.init_distributed_environment()
@@ -257,13 +257,13 @@ def execute_model(
     ) -> List[SamplerOutput]:
         if self.is_driver_worker:
             assert seq_group_metadata_list is not None
-            num_seq_groups = len(seq_group_metadata_list)
+            num_seq_groups: int = len(seq_group_metadata_list)
             assert blocks_to_swap_in is not None
             assert blocks_to_swap_out is not None
             assert blocks_to_copy is not None
             assert len(blocks_to_swap_in) == 0
             assert len(blocks_to_swap_out) == 0
-            data = {
+            data: Dict[str, Any] = {
                 "num_seq_groups": num_seq_groups,
                 "blocks_to_copy": blocks_to_copy,
             }
@@ -273,6 +273,7 @@ def execute_model(
             num_seq_groups = data["num_seq_groups"]
             blocks_to_copy = data["blocks_to_copy"]
 
+        assert blocks_to_copy is not None
         self.cache_copy(blocks_to_copy)
 
         # If there is no input, we don't need to execute the model.
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 1b816afe7c0e7..13e20d8524a1b 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -130,23 +130,17 @@ def __init__(
                               if device_config is not None else DeviceConfig())
         self.device = self.device_config.device
 
-        self.model = None
-        self.block_size = None  # Set after initial profiling.
-        self.lora_manager = None
+        # Set after load_model.
+        self.lora_manager: LRUCacheWorkerLoRAManager = None
 
         self.graph_runners: Dict[int, CUDAGraphRunner] = {}
-        self.graph_memory_pool = None  # Set during graph capture.
+        self.graph_memory_pool: Optional[Tuple[
+            int, int]] = None  # Set during graph capture.
 
         self.max_context_len_to_capture = (
             self.model_config.max_context_len_to_capture
             if self.model_config is not None else 0)
-        # When using CUDA graph, the input block tables must be padded to
-        # max_context_len_to_capture. However, creating the block table in
-        # Python can be expensive. To optimize this, we cache the block table
-        # in numpy and only copy the actual input content at every iteration.
-        # The shape of the cached block table will be
-        # (max batch size to capture, max context len to capture / block size).
-        self.graph_block_tables = None  # Set after initial profiling.
+
         self.pin_memory = is_pin_memory_available()
         self.kv_cache_dtype = kv_cache_dtype
         self.vision_language_config = vision_language_config
@@ -154,6 +148,17 @@ def __init__(
         self.attn_backend = get_attn_backend(
             self.model_config.dtype if model_config is not None else None)
 
+        # Lazy initialization
+        self.model: torch.nn.Module  # Set after load_model
+        self.block_size: int  # Set after initial profiling.
+        # When using CUDA graph, the input block tables must be padded to
+        # max_context_len_to_capture. However, creating the block table in
+        # Python can be expensive. To optimize this, we cache the block table
+        # in numpy and only copy the actual input content at every iteration.
+        # The shape of the cached block table will be
+        # (max batch size to capture, max context len to capture / block size).
+        self.graph_block_tables: torch.Tensor  # Set after initial profiling.
+
     def load_model(self) -> None:
         with CudaMemoryProfiler() as m:
             self.model = get_model(
@@ -491,16 +496,16 @@ def _prepare_decode(
                 lora_index_mapping.append(0)
             batch_size = graph_batch_size
 
-        context_lens = torch.tensor(context_lens,
-                                    dtype=torch.int,
-                                    device=self.device)
+        context_lens_tensor = torch.tensor(context_lens,
+                                           dtype=torch.int,
+                                           device=self.device)
 
         if use_captured_graph:
             # When using cuda-graph all these tensors should be
             # padded.
-            assert context_lens.shape[0] == len(input_tokens)
-            assert context_lens.shape[0] == len(input_positions)
-            assert context_lens.shape[0] == len(slot_mapping)
+            assert context_lens_tensor.shape[0] == len(input_tokens)
+            assert context_lens_tensor.shape[0] == len(input_positions)
+            assert context_lens_tensor.shape[0] == len(slot_mapping)
 
             # The shape of graph_block_tables is
             # [max batch size, max context len // block size].
@@ -529,7 +534,7 @@ def _prepare_decode(
             max_prompt_len=None,
             subquery_start_loc=None,
             seq_start_loc=None,
-            context_lens=context_lens,
+            context_lens=context_lens_tensor,
             block_tables=block_tables,
             use_cuda_graph=use_captured_graph,
         )
@@ -553,7 +558,11 @@ def _prepare_sample(
         selected_token_indices: List[int] = []
         generators: List[torch.Generator] = []
         selected_token_start_idx = 0
-        categorized_sample_indices = {t: [] for t in SamplingType}
+        categorized_sample_indices: Dict[SamplingType,
+                                         List[Tuple[int, int]]] = {
+                                             t: []
+                                             for t in SamplingType
+                                         }
         categorized_sample_indices_start_idx = 0
         categorized_sampled_token_indices_start_idx = 0
 
@@ -571,10 +580,9 @@ def _prepare_sample(
                     categorized_sample_indices_start_idx += subquery_len - 1
 
                 categorized_sample_indices[
-                    sampling_params.sampling_type].append([
-                        categorized_sample_indices_start_idx,
-                        categorized_sampled_token_indices_start_idx
-                    ])
+                    sampling_params.sampling_type].append(
+                        (categorized_sample_indices_start_idx,
+                         categorized_sampled_token_indices_start_idx))
                 categorized_sample_indices_start_idx += 1
                 categorized_sampled_token_indices_start_idx += 1
 
@@ -598,15 +606,16 @@ def _prepare_sample(
 
                 categorized_sample_indices[
                     sampling_params.sampling_type].extend(
-                        zip(
-                            range(
-                                categorized_sample_indices_start_idx,
-                                categorized_sample_indices_start_idx +
-                                num_seqs),
-                            range(
-                                categorized_sampled_token_indices_start_idx,
-                                categorized_sampled_token_indices_start_idx +
-                                num_seqs)))
+                        list(
+                            zip(
+                                range(
+                                    categorized_sample_indices_start_idx,
+                                    categorized_sample_indices_start_idx +
+                                    num_seqs),
+                                range(
+                                    categorized_sampled_token_indices_start_idx,
+                                    categorized_sampled_token_indices_start_idx
+                                    + num_seqs))))
                 categorized_sample_indices_start_idx += num_seqs
                 categorized_sampled_token_indices_start_idx += num_seqs
 
@@ -643,9 +652,9 @@ def _prepare_sample(
 
     def prepare_input_tensors(
         self,
-        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+        seq_group_metadata_list: List[SequenceGroupMetadata],
     ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
-               Set[int], LoRAMapping, torch.Tensor]:
+               Set[LoRARequest], LoRAMapping, torch.Tensor]:
         if self.is_driver_worker:
             prefill_reqs = []
             decode_reqs = []
@@ -743,6 +752,7 @@ def prepare_input_tensors(
             if prefill_attn_metadata is not None:
                 metadata_dict.update(prefill_attn_metadata.asdict_zerocopy())
             else:
+                assert decode_attn_metadata is not None
                 metadata_dict.update(decode_attn_metadata.asdict_zerocopy())
             broadcast_tensor_dict(metadata_dict, src=0)
 
@@ -811,7 +821,7 @@ def prepare_input_tensors(
     @torch.inference_mode()
     def execute_model(
         self,
-        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+        seq_group_metadata_list: List[SequenceGroupMetadata],
         kv_caches: List[torch.Tensor],
     ) -> Optional[SamplerOutput]:
         (input_tokens, input_positions, attn_metadata, sampling_metadata,
@@ -925,7 +935,7 @@ def remove_all_loras(self) -> bool:
             raise RuntimeError("LoRA is not enabled.")
         return self.lora_manager.remove_all_loras()
 
-    def set_active_loras(self, lora_requests: List[LoRARequest],
+    def set_active_loras(self, lora_requests: Set[LoRARequest],
                          lora_mapping: LoRAMapping) -> None:
         if not self.lora_manager:
             raise RuntimeError("LoRA is not enabled.")
@@ -1069,10 +1079,16 @@ def __init__(self, model: nn.Module):
         super().__init__()
 
         self.model = model
-        self.graph = None
         self.input_buffers: Dict[str, torch.Tensor] = {}
         self.output_buffers: Dict[str, torch.Tensor] = {}
 
+        self._graph: Optional[torch.cuda.CUDAGraph] = None
+
+    @property
+    def graph(self):
+        assert self._graph is not None
+        return self._graph
+
     def capture(
         self,
         input_ids: torch.Tensor,
@@ -1082,7 +1098,7 @@ def capture(
         memory_pool,
         **kwargs,
     ) -> None:
-        assert self.graph is None
+        assert self._graph is None
         # Run the model once without capturing the graph.
         # This is to make sure that the captured graph does not include the
         # kernel launches for initial benchmarking (e.g., Triton autotune).
@@ -1099,8 +1115,8 @@ def capture(
         # Capture the graph.
         # NOTE(woosuk): Python 3.8 does not support multi-line with statements.
         # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement
-        self.graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(self.graph, pool=memory_pool):  # noqa: SIM117
+        self._graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(self._graph, pool=memory_pool):  # noqa: SIM117
             with _maybe_pynccl():
                 hidden_states = self.model(
                     input_ids,
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index f70a7193effeb..487df334d73e3 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -1,6 +1,7 @@
 from typing import Dict, List, Optional, Tuple
 
 import torch
+from torch import nn
 
 from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig)
@@ -34,9 +35,11 @@ def __init__(
         self.device_config = (device_config
                               if device_config is not None else DeviceConfig())
         self.device = self.device_config.device
-        self.model = None
         self.pin_memory = is_pin_memory_available()
 
+        # Lazy initialization.
+        self.model: nn.Module  # initialize after load_model.
+
     def load_model(self) -> None:
         self.model = get_neuron_model(self.model_config,
                                       parallel_config=self.parallel_config,
@@ -147,7 +150,11 @@ def _prepare_sample(
         selected_token_indices: List[int] = []
         generators: List[torch.Generator] = []
         selected_token_start_idx = 0
-        categorized_sample_indices = {t: [] for t in SamplingType}
+        categorized_sample_indices: Dict[SamplingType,
+                                         List[Tuple[int, int]]] = {
+                                             t: []
+                                             for t in SamplingType
+                                         }
         categorized_sample_indices_start_idx = 0
         categorized_sampled_token_indices_start_idx = 0
 
@@ -165,10 +172,9 @@ def _prepare_sample(
                     categorized_sample_indices_start_idx += prompt_len - 1
 
                 categorized_sample_indices[
-                    sampling_params.sampling_type].append([
-                        categorized_sample_indices_start_idx,
-                        categorized_sampled_token_indices_start_idx
-                    ])
+                    sampling_params.sampling_type].append(
+                        (categorized_sample_indices_start_idx,
+                         categorized_sampled_token_indices_start_idx))
                 categorized_sample_indices_start_idx += 1
                 categorized_sampled_token_indices_start_idx += 1
 
@@ -237,7 +243,7 @@ def _prepare_sample(
 
     def prepare_input_tensors(
         self,
-        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+        seq_group_metadata_list: List[SequenceGroupMetadata],
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, SamplingMetadata]:
         # NOTE: We assume that all sequences in the group are all prompts or
         # all decodes.
@@ -259,7 +265,7 @@ def prepare_input_tensors(
     @torch.inference_mode()
     def execute_model(
         self,
-        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+        seq_group_metadata_list: List[SequenceGroupMetadata],
     ) -> Optional[SamplerOutput]:
         (input_tokens, input_positions, input_block_ids, sampling_metadata
          ) = self.prepare_input_tensors(seq_group_metadata_list)
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index b021866965401..2203570b37ad6 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -1,7 +1,7 @@
 """A GPU worker class."""
 import gc
 import os
-from typing import Dict, List, Optional, Set, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple
 
 import torch
 import torch.distributed
@@ -82,8 +82,8 @@ def __init__(
         )
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
-        self.cache_engine = None
-        self.gpu_cache = None
+        self.cache_engine: CacheEngine
+        self.gpu_cache: List[torch.Tensor]
 
     def init_device(self) -> None:
         if self.device_config.device.type == "cuda":
@@ -223,7 +223,7 @@ def execute_model(
             assert blocks_to_swap_in is not None
             assert blocks_to_swap_out is not None
             assert blocks_to_copy is not None
-            data = {
+            data: Dict[str, Any] = {
                 "num_seq_groups": num_seq_groups,
                 "blocks_to_swap_in": blocks_to_swap_in,
                 "blocks_to_swap_out": blocks_to_swap_out,
@@ -237,6 +237,9 @@ def execute_model(
             blocks_to_swap_out = data["blocks_to_swap_out"]
             blocks_to_copy = data["blocks_to_copy"]
 
+        assert blocks_to_swap_in is not None
+        assert blocks_to_swap_out is not None
+        assert blocks_to_copy is not None
         self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)
 
         # If there is no input, we don't need to execute the model.
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 309aa6256acea..13e062fe64b29 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -1,7 +1,7 @@
 import importlib
 import os
 from abc import ABC, abstractmethod
-from typing import Dict, List, Tuple
+from typing import Dict, List, Set, Tuple
 
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -56,7 +56,7 @@ def execute_model(
         raise NotImplementedError
 
     @abstractmethod
-    def get_cache_block_size_bytes() -> int:
+    def get_cache_block_size_bytes(self) -> int:
         """Return the size of a single cache block, in bytes. Used in
         speculative decoding.
         """
@@ -71,7 +71,7 @@ def remove_lora(self, lora_id: int) -> bool:
         raise NotImplementedError
 
     @abstractmethod
-    def list_loras(self) -> List[int]:
+    def list_loras(self) -> Set[int]:
         raise NotImplementedError
 
 
@@ -86,7 +86,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool:
     def remove_lora(self, lora_id: int) -> bool:
         raise ValueError(f"{type(self)} does not support LoRA")
 
-    def list_loras(self) -> List[int]:
+    def list_loras(self) -> Set[int]:
         raise ValueError(f"{type(self)} does not support LoRA")
 
 
From 085d445868a42fe57e0badb1d57ed091bf36da69 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 17 Apr 2024 22:28:52 -0700
Subject: [PATCH 035/106] [Core] nccl integrity check and test (#4155)

[Core] Add integrity check during initialization; add test for it (#4155)
---
 .buildkite/test-pipeline.yaml                 |  1 +
 tests/distributed/test_pynccl_library.py      | 43 ++++++++++++++++
 .../device_communicators/pynccl.py            | 38 +++++---------
 vllm/utils.py                                 | 51 +++++++++++++++++++
 4 files changed, 107 insertions(+), 26 deletions(-)
 create mode 100644 tests/distributed/test_pynccl_library.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f39c3302ac2e9..2263dee20fbed 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -33,6 +33,7 @@ steps:
   num_gpus: 2 # only support 1 or 2 for now.
   commands:
   - pytest -v -s test_pynccl.py
+  - pytest -v -s test_pynccl_library.py
   - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
diff --git a/tests/distributed/test_pynccl_library.py b/tests/distributed/test_pynccl_library.py
new file mode 100644
index 0000000000000..ec60a5ed3114d
--- /dev/null
+++ b/tests/distributed/test_pynccl_library.py
@@ -0,0 +1,43 @@
+import multiprocessing
+import tempfile
+
+
+def target_fn(env, filepath):
+    from vllm.utils import update_environment_variables
+    update_environment_variables(env)
+    from vllm.utils import nccl_integrity_check
+    nccl_integrity_check(filepath)
+
+
+def test_library_file():
+    # note: don't import vllm.distributed.device_communicators.pynccl
+    # before running this test, otherwise the library file will be loaded
+    # and it might interfere with the test
+    from vllm.utils import find_nccl_library
+    so_file = find_nccl_library()
+    with open(so_file, 'rb') as f:
+        content = f.read()
+    try:
+        # corrupt the library file, should raise an exception
+        with open(so_file, 'wb') as f:
+            f.write(content[:len(content) // 2])
+        p = multiprocessing.Process(target=target_fn, args=({}, so_file))
+        p.start()
+        p.join()
+        assert p.exitcode != 0
+
+        # move the library file to a tmp path
+        # test VLLM_NCCL_SO_PATH
+        fd, path = tempfile.mkstemp()
+        with open(path, 'wb') as f:
+            f.write(content)
+        p = multiprocessing.Process(target=target_fn,
+                                    args=({
+                                        "VLLM_NCCL_SO_PATH": path
+                                    }, path))
+        p.start()
+        p.join()
+        assert p.exitcode == 0
+    finally:
+        with open(so_file, 'wb') as f:
+            f.write(content)
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 0a8bb860efa1c..c57a4f59d442c 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -21,8 +21,7 @@
 
 import ctypes
 import datetime
-import glob
-import os
+import platform
 
 # ===================== import region =====================
 import torch
@@ -30,40 +29,27 @@
 from torch.distributed import ReduceOp
 
 from vllm.logger import init_logger
+from vllm.utils import find_nccl_library, nccl_integrity_check
 
 logger = init_logger(__name__)
 
-so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
-
-# check if we have vllm-managed nccl
-vllm_nccl_path = None
-if torch.version.cuda is not None:
-    cuda_major = torch.version.cuda.split(".")[0]
-    path = os.path.expanduser(
-        f"~/.config/vllm/nccl/cu{cuda_major}/libnccl.so.*")
-    files = glob.glob(path)
-    vllm_nccl_path = files[0] if files else None
-
-# manually load the nccl library
-if so_file:
-    logger.info(
-        f"Loading nccl from environment variable VLLM_NCCL_SO_PATH={so_file}")
-else:
-    if torch.version.cuda is not None:
-        so_file = vllm_nccl_path or "libnccl.so.2"
-    elif torch.version.hip is not None:
-        so_file = "librccl.so.1"
-    else:
-        raise ValueError("NCCL only supports CUDA and ROCm backends.")
-    logger.info(f"Loading nccl from library {so_file}")
+so_file = find_nccl_library()
 
 try:
+    # load the library in another process.
+    # if it core dumps, it will not crash the current process
+    nccl_integrity_check(so_file)
     nccl = ctypes.CDLL(so_file)
 except Exception as e:
     logger.error(
         f"Failed to load NCCL library from {so_file} ."
         "It is expected if you are not running on NVIDIA/AMD GPUs."
-        "Otherwise please set the environment variable VLLM_NCCL_SO_PATH"
+        "Otherwise, the nccl library might not exist, be corrupted "
+        f"or it does not support the current platform {platform.platform()}."
+        f"One solution is to download libnccl2 version 2.18 from "
+        f"https://developer.download.nvidia.com/compute/cuda/repos/ "
+        f"and extract the libnccl.so.2 file. If you already have the "
+        f"library, please set the environment variable VLLM_NCCL_SO_PATH"
         " to point to the correct nccl library path.")
     raise e
 
diff --git a/vllm/utils.py b/vllm/utils.py
index dabd96e7489d8..055f33e63e628 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1,6 +1,7 @@
 import asyncio
 import enum
 import gc
+import glob
 import os
 import socket
 import subprocess
@@ -517,3 +518,53 @@ def init_cached_hf_modules():
     """
     from transformers.dynamic_module_utils import init_hf_modules
     init_hf_modules()
+
+
+def nccl_integrity_check(filepath):
+    """
+    when the library is corrupted, we cannot catch
+    the exception in python. it will crash the process.
+    instead, we use the exit code of `ldd` to check
+    if the library is corrupted. if not, we will return
+    the version of the library.
+    """
+    exit_code = os.system(f"ldd {filepath} 2>&1 > /dev/null")
+    if exit_code != 0:
+        raise RuntimeError(f"Failed to load NCCL library from {filepath} .")
+    import ctypes
+
+    nccl = ctypes.CDLL(filepath)
+    version = ctypes.c_int()
+    nccl.ncclGetVersion.restype = ctypes.c_int
+    nccl.ncclGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)]
+    result = nccl.ncclGetVersion(ctypes.byref(version))
+    assert result == 0
+    return version.value
+
+
+def find_nccl_library():
+    so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
+
+    # check if we have vllm-managed nccl
+    vllm_nccl_path = None
+    if torch.version.cuda is not None:
+        cuda_major = torch.version.cuda.split(".")[0]
+        path = os.path.expanduser(
+            f"~/.config/vllm/nccl/cu{cuda_major}/libnccl.so.*")
+        files = glob.glob(path)
+        vllm_nccl_path = files[0] if files else None
+
+    # manually load the nccl library
+    if so_file:
+        logger.info(
+            f"Found nccl from environment variable VLLM_NCCL_SO_PATH={so_file}"
+        )
+    else:
+        if torch.version.cuda is not None:
+            so_file = vllm_nccl_path or "libnccl.so.2"
+        elif torch.version.hip is not None:
+            so_file = "librccl.so.1"
+        else:
+            raise ValueError("NCCL only supports CUDA and ROCm backends.")
+        logger.info(f"Found nccl from library {so_file}")
+    return so_file

From 05086c1f889f52f9ad93186e1b19b642d2a9b991 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 18 Apr 2024 08:16:26 +0100
Subject: [PATCH 036/106] Allow model to be served under multiple names (#2894)

Co-authored-by: Alexandre Payot <alexandrep@graphcore.ai>
---
 vllm/entrypoints/openai/api_server.py         |  8 ++++----
 vllm/entrypoints/openai/cli_args.py           | 10 +++++++---
 vllm/entrypoints/openai/serving_chat.py       |  8 ++++----
 vllm/entrypoints/openai/serving_completion.py |  6 +++---
 vllm/entrypoints/openai/serving_engine.py     | 15 ++++++++-------
 5 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 32282bfd8d12b..d6673976bb775 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -150,18 +150,18 @@ async def authentication(request: Request, call_next):
     logger.info(f"args: {args}")
 
     if args.served_model_name is not None:
-        served_model = args.served_model_name
+        served_model_names = args.served_model_name
     else:
-        served_model = args.model
+        served_model_names = [args.model]
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngine.from_engine_args(
         engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
-    openai_serving_chat = OpenAIServingChat(engine, served_model,
+    openai_serving_chat = OpenAIServingChat(engine, served_model_names,
                                             args.response_role,
                                             args.lora_modules,
                                             args.chat_template)
     openai_serving_completion = OpenAIServingCompletion(
-        engine, served_model, args.lora_modules)
+        engine, served_model_names, args.lora_modules)
 
     app.root_path = args.root_path
     uvicorn.run(app,
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index cc71931b97955..5c361b4d184ee 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -54,11 +54,15 @@ def make_arg_parser():
                         help="If provided, the server will require this key "
                         "to be presented in the header.")
     parser.add_argument("--served-model-name",
+                        nargs="+",
                         type=str,
                         default=None,
-                        help="The model name used in the API. If not "
-                        "specified, the model name will be the same as "
-                        "the huggingface name.")
+                        help="The model name(s) used in the API. If multiple "
+                        "names are provided, the server will respond to any "
+                        "of the provided names. The model name in the model "
+                        "field of a response will be the first name in this "
+                        "list. If not specified, the model name will be the "
+                        "same as the `--model` argument.")
     parser.add_argument(
         "--lora-modules",
         type=str,
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index c9ed4a9de20f4..f35eab15bc487 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -24,12 +24,12 @@ class OpenAIServingChat(OpenAIServing):
 
     def __init__(self,
                  engine: AsyncLLMEngine,
-                 served_model: str,
+                 served_model_names: List[str],
                  response_role: str,
                  lora_modules: Optional[List[LoRA]] = None,
                  chat_template=None):
         super().__init__(engine=engine,
-                         served_model=served_model,
+                         served_model_names=served_model_names,
                          lora_modules=lora_modules)
         self.response_role = response_role
         self._load_chat_template(chat_template)
@@ -109,7 +109,7 @@ async def chat_completion_stream_generator(
             result_generator: AsyncIterator[RequestOutput], request_id: str
     ) -> Union[ErrorResponse, AsyncGenerator[str, None]]:
 
-        model_name = request.model
+        model_name = self.served_model_names[0]
         created_time = int(time.time())
         chunk_object_type = "chat.completion.chunk"
         first_iteration = True
@@ -251,7 +251,7 @@ async def chat_completion_full_generator(
             result_generator: AsyncIterator[RequestOutput],
             request_id: str) -> Union[ErrorResponse, ChatCompletionResponse]:
 
-        model_name = request.model
+        model_name = self.served_model_names[0]
         created_time = int(time.time())
         final_res: RequestOutput = None
 
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index a71f2d6a4426a..b7e2530a69b51 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -53,10 +53,10 @@ class OpenAIServingCompletion(OpenAIServing):
 
     def __init__(self,
                  engine: AsyncLLMEngine,
-                 served_model: str,
+                 served_model_names: List[str],
                  lora_modules: Optional[List[LoRA]] = None):
         super().__init__(engine=engine,
-                         served_model=served_model,
+                         served_model_names=served_model_names,
                          lora_modules=lora_modules)
 
     async def create_completion(self, request: CompletionRequest,
@@ -79,7 +79,7 @@ async def create_completion(self, request: CompletionRequest,
             return self.create_error_response(
                 "suffix is not currently supported")
 
-        model_name = request.model
+        model_name = self.served_model_names[0]
         request_id = f"cmpl-{random_uuid()}"
         created_time = int(time.time())
 
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 78b951c8ba161..5e1e7a3749edd 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -29,10 +29,10 @@ class OpenAIServing:
 
     def __init__(self,
                  engine: AsyncLLMEngine,
-                 served_model: str,
+                 served_model_names: List[str],
                  lora_modules=Optional[List[LoRA]]):
         self.engine = engine
-        self.served_model = served_model
+        self.served_model_names = served_model_names
         if lora_modules is None:
             self.lora_requests = []
         else:
@@ -82,13 +82,14 @@ async def _post_init(self):
     async def show_available_models(self) -> ModelList:
         """Show available models. Right now we only have one model."""
         model_cards = [
-            ModelCard(id=self.served_model,
-                      root=self.served_model,
+            ModelCard(id=served_model_name,
+                      root=self.served_model_names[0],
                       permission=[ModelPermission()])
+            for served_model_name in self.served_model_names
         ]
         lora_cards = [
             ModelCard(id=lora.lora_name,
-                      root=self.served_model,
+                      root=self.served_model_names[0],
                       permission=[ModelPermission()])
             for lora in self.lora_requests
         ]
@@ -158,7 +159,7 @@ def create_streaming_error_response(
         return json_str
 
     async def _check_model(self, request) -> Optional[ErrorResponse]:
-        if request.model == self.served_model:
+        if request.model in self.served_model_names:
             return
         if request.model in [lora.lora_name for lora in self.lora_requests]:
             return
@@ -168,7 +169,7 @@ async def _check_model(self, request) -> Optional[ErrorResponse]:
             status_code=HTTPStatus.NOT_FOUND)
 
     def _maybe_get_lora(self, request) -> Optional[LoRARequest]:
-        if request.model == self.served_model:
+        if request.model in self.served_model_names:
             return
         for lora in self.lora_requests:
             if request.model == lora.lora_name:

From d005abba3abb7dde86dc0d4f16a5d6cbce6edaa1 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Thu, 18 Apr 2024 03:21:55 -0400
Subject: [PATCH 037/106] [Bugfix] Get available quantization methods from
 quantization registry (#4098)

---
 benchmarks/benchmark_latency.py                     | 3 ++-
 benchmarks/benchmark_throughput.py                  | 4 +++-
 tests/models/test_marlin.py                         | 7 +++----
 vllm/config.py                                      | 7 ++++---
 vllm/engine/arg_utils.py                            | 3 ++-
 vllm/model_executor/layers/quantization/__init__.py | 7 ++++---
 6 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index aadbc441713fc..44da3bad8d840 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -9,6 +9,7 @@
 from tqdm import tqdm
 
 from vllm import LLM, SamplingParams
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
 
 def main(args: argparse.Namespace):
@@ -101,7 +102,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
     parser.add_argument('--tokenizer', type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
-                        choices=['awq', 'gptq', 'squeezellm', None],
+                        choices=[*QUANTIZATION_METHODS, None],
                         default=None)
     parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
     parser.add_argument('--input-len', type=int, default=32)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 6df1e1d628e6c..6bb889d1eceba 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -10,6 +10,8 @@
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
 
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
 
 def sample_requests(
     dataset_path: str,
@@ -267,7 +269,7 @@ def main(args: argparse.Namespace):
     parser.add_argument("--tokenizer", type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
-                        choices=['awq', 'gptq', 'squeezellm', None],
+                        choices=[*QUANTIZATION_METHODS, None],
                         default=None)
     parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
     parser.add_argument("--n",
diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py
index eb225522447aa..3c2418fca972a 100644
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -23,15 +23,14 @@
 import torch
 from compare_utils import check_logprobs_close
 
-from vllm.model_executor.layers.quantization import (
-    _QUANTIZATION_CONFIG_REGISTRY)
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
 MAX_MODEL_LEN = 1024
 
 capability = torch.cuda.get_device_capability()
 capability = capability[0] * 10 + capability[1]
-marlin_not_supported = (
-    capability < _QUANTIZATION_CONFIG_REGISTRY["marlin"].get_min_capability())
+marlin_not_supported = (capability <
+                        QUANTIZATION_METHODS["marlin"].get_min_capability())
 
 
 @dataclass
diff --git a/vllm/config.py b/vllm/config.py
index 3008d08c9a8f0..708fdf3d1c487 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -9,6 +9,7 @@
 from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.transformers_utils.config import get_config, get_hf_text_config
 from vllm.utils import (get_cpu_memory, get_nvcc_cuda_version, is_cpu, is_hip,
                         is_neuron)
@@ -150,8 +151,8 @@ def _verify_sparsity(self) -> None:
                     f"({self.sparsity}).")
 
     def _verify_quantization(self) -> None:
-        supported_quantization = ["awq", "gptq", "squeezellm", "marlin"]
-        rocm_not_supported_quantization = ["awq", "marlin"]
+        supported_quantization = [*QUANTIZATION_METHODS]
+        rocm_supported_quantization = ["gptq", "squeezellm"]
         if self.quantization is not None:
             self.quantization = self.quantization.lower()
 
@@ -187,7 +188,7 @@ def _verify_quantization(self) -> None:
                     f"Unknown quantization method: {self.quantization}. Must "
                     f"be one of {supported_quantization}.")
             if is_hip(
-            ) and self.quantization in rocm_not_supported_quantization:
+            ) and self.quantization not in rocm_supported_quantization:
                 raise ValueError(
                     f"{self.quantization} quantization is currently not "
                     f"supported in ROCm.")
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ffcd66eb59815..a2dfe0cbdc86e 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -9,6 +9,7 @@
                          EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
                          TokenizerPoolConfig, VisionLanguageConfig)
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import str_to_int_tuple
 
 
@@ -290,7 +291,7 @@ def add_cli_args(
         parser.add_argument('--quantization',
                             '-q',
                             type=str,
-                            choices=['awq', 'gptq', 'squeezellm', None],
+                            choices=[*QUANTIZATION_METHODS, None],
                             default=EngineArgs.quantization,
                             help='Method used to quantize the weights. If '
                             'None, we first check the `quantization_config` '
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index ad988d48755b0..a3b89a66469eb 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -7,7 +7,7 @@
 from vllm.model_executor.layers.quantization.marlin import MarlinConfig
 from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
 
-_QUANTIZATION_CONFIG_REGISTRY = {
+QUANTIZATION_METHODS = {
     "awq": AWQConfig,
     "gptq": GPTQConfig,
     "squeezellm": SqueezeLLMConfig,
@@ -16,12 +16,13 @@
 
 
 def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
-    if quantization not in _QUANTIZATION_CONFIG_REGISTRY:
+    if quantization not in QUANTIZATION_METHODS:
         raise ValueError(f"Invalid quantization method: {quantization}")
-    return _QUANTIZATION_CONFIG_REGISTRY[quantization]
+    return QUANTIZATION_METHODS[quantization]
 
 
 __all__ = [
     "QuantizationConfig",
     "get_quantization_config",
+    "QUANTIZATION_METHODS",
 ]

From e70ec2ff9bf849506045798cff6a7020d3b4600a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Moskal?= <michal@moskal.me>
Date: Thu, 18 Apr 2024 00:51:28 -0700
Subject: [PATCH 038/106] [Bugfix][Kernel] allow non-power-of-two head sizes in
 prefix prefill (#4128)

---
 tests/kernels/test_prefix_prefill.py |  2 +-
 vllm/attention/ops/prefix_prefill.py | 44 +++++++++++++++++-----------
 2 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py
index cd303ee4a4448..d581a0c843b76 100644
--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -10,7 +10,7 @@
 
 NUM_HEADS = [64]
 NUM_QUERIES_PER_KV = [1, 8, 64]
-HEAD_SIZES = [128]
+HEAD_SIZES = [128, 96]
 DTYPES = [torch.float16]
 CUDA_DEVICES = [
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 70f09224f1cf6..4896cf3909c6e 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -47,7 +47,8 @@ def _fwd_kernel(
         stride_v_cache_bl,
         num_queries_per_kv: int,
         BLOCK_M: tl.constexpr,
-        BLOCK_DMODEL: tl.constexpr,
+        BLOCK_DMODEL: tl.constexpr,  # head size
+        BLOCK_DMODEL_PADDED: tl.constexpr,  # head size padded to a power of 2
         BLOCK_N: tl.constexpr,
     ):
         cur_batch = tl.program_id(0)
@@ -59,26 +60,30 @@ def _fwd_kernel(
         cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch)
         cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
         cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
+        cur_batch_query_len = cur_batch_seq_len - cur_batch_ctx_len
 
         block_start_loc = BLOCK_M * start_m
 
         # initialize offsets
         offs_n = tl.arange(0, BLOCK_N)
-        offs_d = tl.arange(0, BLOCK_DMODEL)
+        offs_d = tl.arange(0, BLOCK_DMODEL_PADDED)
         offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
         off_q = (
             (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
             cur_head * stride_qh + offs_d[None, :] * stride_qd)
 
-        q = tl.load(
-            Q + off_q,
-            mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
-            other=0.0)
+        dim_mask = tl.where(
+            tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1, 0).to(tl.int1)
+
+        q = tl.load(Q + off_q,
+                    mask=dim_mask[None, :] &
+                    (offs_m[:, None] < cur_batch_query_len),
+                    other=0.0)
 
         # # initialize pointer to m and l
         m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
         l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
-        acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+        acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED], dtype=tl.float32)
 
         for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
             start_n = tl.multiple_of(start_n, BLOCK_N)
@@ -99,7 +104,8 @@ def _fwd_kernel(
                 offs_d[None, :] * stride_v_cache_d +
                 (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
             k = tl.load(K_cache + off_k,
-                        mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
+                        mask=dim_mask[:, None] &
+                        ((start_n + offs_n[None, :]) < cur_batch_ctx_len),
                         other=0.0)
 
             qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
@@ -126,7 +132,8 @@ def _fwd_kernel(
             acc = acc * acc_scale[:, None]
             # update acc
             v = tl.load(V_cache + off_v,
-                        mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len,
+                        mask=dim_mask[None, :] &
+                        ((start_n + offs_n[:, None]) < cur_batch_ctx_len),
                         other=0.0)
 
             p = p.to(v.dtype)
@@ -142,16 +149,15 @@ def _fwd_kernel(
         k_ptrs = K + off_k
         v_ptrs = V + off_v
 
-        block_mask = tl.where(
-            block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0)
+        block_mask = tl.where(block_start_loc < cur_batch_query_len, 1, 0)
 
         for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N):
             start_n = tl.multiple_of(start_n, BLOCK_N)
             # -- compute qk ----
             k = tl.load(k_ptrs +
                         (cur_batch_in_all_start_index + start_n) * stride_kbs,
-                        mask=(start_n + offs_n[None, :]) <
-                        cur_batch_seq_len - cur_batch_ctx_len,
+                        mask=dim_mask[:, None] &
+                        ((start_n + offs_n[None, :]) < cur_batch_query_len),
                         other=0.0)
 
             qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
@@ -179,8 +185,8 @@ def _fwd_kernel(
             # update acc
             v = tl.load(v_ptrs +
                         (cur_batch_in_all_start_index + start_n) * stride_vbs,
-                        mask=(start_n + offs_n[:, None]) <
-                        cur_batch_seq_len - cur_batch_ctx_len,
+                        mask=dim_mask[None, :] &
+                        ((start_n + offs_n[:, None]) < cur_batch_query_len),
                         other=0.0)
 
             p = p.to(v.dtype)
@@ -195,7 +201,8 @@ def _fwd_kernel(
         out_ptrs = Out + off_o
         tl.store(out_ptrs,
                  acc,
-                 mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
+                 mask=dim_mask[None, :] &
+                 (offs_m[:, None] < cur_batch_query_len))
         return
 
     @triton.jit
@@ -636,7 +643,8 @@ def context_attention_fwd(q,
         # shape constraints
         Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
         assert Lq == Lk and Lk == Lv
-        assert Lk in {16, 32, 64, 128}
+        # round up Lk to a power of 2 - this is required for Triton block size
+        Lk_padded = 2**((Lk - 1).bit_length())
 
         sm_scale = 1.0 / (Lq**0.5)
         batch, head = b_seq_len.shape[0], q.shape[1]
@@ -646,6 +654,7 @@ def context_attention_fwd(q,
 
         num_warps = 8 if Lk <= 64 else 8
         if alibi_slopes is not None:
+            assert Lk == Lk_padded
             _fwd_kernel_alibi[grid](
                 q,
                 k,
@@ -738,6 +747,7 @@ def context_attention_fwd(q,
             num_queries_per_kv=num_queries_per_kv,
             BLOCK_M=BLOCK,
             BLOCK_DMODEL=Lk,
+            BLOCK_DMODEL_PADDED=Lk_padded,
             BLOCK_N=BLOCK,
             num_warps=num_warps,
             num_stages=1,

From 6b688e3857cfb065ee416a2235060a301af59cd8 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Thu, 18 Apr 2024 10:55:48 -0700
Subject: [PATCH 039/106] [Docs] document that Meta Llama 3 is supported
 (#4175)

---
 docs/source/models/supported_models.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 5e5ce871f61dd..951fc3aac0c75 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -80,8 +80,8 @@ Alongside each architecture, we include some popular models that use it.
     - :code:`core42/jais-13b`, :code:`core42/jais-13b-chat`, :code:`core42/jais-30b-v3`, :code:`core42/jais-30b-chat-v3`, etc.
     -
   * - :code:`LlamaForCausalLM`
-    - LLaMA, LLaMA-2, Vicuna, Alpaca, Yi
-    - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
+    - LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
+    - :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
     - ✅︎
   * - :code:`MiniCPMForCausalLM`
     - MiniCPM

From fcd0b113d1c93a2922adda4da53d3938444d9e6e Mon Sep 17 00:00:00 2001
From: James Whedbee <jamesw@telnyx.com>
Date: Thu, 18 Apr 2024 16:12:55 -0500
Subject: [PATCH 040/106] [Bugfix] Support logprobs when using guided_json and
 other constrained decoding fields (#4149)

---
 tests/entrypoints/test_openai_server.py   | 30 +++++++++++++++++++++++
 vllm/entrypoints/openai/serving_engine.py |  4 ++-
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index bdb80056f3961..2e1f1f1a707a3 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -723,6 +723,36 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
             extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA))
 
 
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
+                                           guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        "The best language for type-safe systems programming is "
+    }]
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=10,
+        logprobs=True,
+        top_logprobs=5,
+        extra_body=dict(guided_choice=TEST_CHOICE,
+                        guided_decoding_backend=guided_decoding_backend))
+    top_logprobs = chat_completion.choices[0].logprobs.top_logprobs
+
+    # -9999.0 is the minimum logprob returned by OpenAI
+    assert all(
+        isinstance(logprob, float) and logprob >= -9999.0
+        for token_dict in top_logprobs
+        for token, logprob in token_dict.items())
+
+
 async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
     resp = await client.chat.completions.create(
         model=MODEL_NAME,
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 5e1e7a3749edd..a95a39ce3008b 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -124,7 +124,9 @@ def _create_logprobs(
 
                 if num_output_top_logprobs:
                     logprobs.top_logprobs.append({
-                        p.decoded_token: p.logprob
+                        # Convert float("-inf") to the
+                        # JSON-serializable float that OpenAI uses
+                        p.decoded_token: max(p.logprob, -9999.0)
                         for i, p in step_top_logprobs.items()
                     } if step_top_logprobs else None)
 

From 7b9df951cfaf655c463de4f81b8eb91416641587 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Thu, 18 Apr 2024 14:36:39 -0700
Subject: [PATCH 041/106] [Misc] Bump transformers to latest version (#4176)

---
 requirements-common.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index c1614d2537b25..3de0f98e7c0c3 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -5,7 +5,8 @@ sentencepiece  # Required for LLaMA tokenizer.
 numpy
 requests
 py-cpuinfo
-transformers >= 4.39.1  # Required for StarCoder2 & Llava.
+transformers >= 4.40.0  # Required for StarCoder2 & Llava, Llama 3.
+tokenizers >= 0.19.1  # Required for Llama 3.
 fastapi
 uvicorn[standard]
 pydantic >= 2.0  # Required for OpenAI server.

From ce724662fe66463f6f0b958c21e95f36ed733005 Mon Sep 17 00:00:00 2001
From: Liangfu Chen <liangfc@amazon.com>
Date: Thu, 18 Apr 2024 15:26:01 -0700
Subject: [PATCH 042/106] [CI/CD] add neuron docker and ci test scripts (#3571)

---
 .buildkite/run-neuron-test.sh    | 37 ++++++++++++++++++++++++++++++++
 .buildkite/test-template.j2      |  5 +++++
 Dockerfile.neuron                | 36 +++++++++++++++++++++++++++++++
 setup.py                         |  3 ++-
 vllm/engine/async_llm_engine.py  |  4 ++--
 vllm/executor/neuron_executor.py | 22 ++++++++++++++++++-
 6 files changed, 103 insertions(+), 4 deletions(-)
 create mode 100644 .buildkite/run-neuron-test.sh
 create mode 100644 Dockerfile.neuron

diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
new file mode 100644
index 0000000000000..8ba03b78e8dbf
--- /dev/null
+++ b/.buildkite/run-neuron-test.sh
@@ -0,0 +1,37 @@
+# This script build the Neuron docker image and run the API server inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -e
+
+# Try building the docker image
+aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+docker build -t neuron -f Dockerfile.neuron .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f neuron || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image
+docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
+       --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
+
+# Wait for the server to start
+wait_for_server_to_start() {
+    timeout=300
+    counter=0
+
+    while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
+        sleep 1
+        counter=$((counter + 1))
+        if [ $counter -ge $timeout ]; then
+            echo "Timeout after $timeout seconds"
+            break
+        fi
+    done
+}
+wait_for_server_to_start
+
+# Test a simple prompt
+curl -X POST -H "Content-Type: application/json" \
+    localhost:8000/generate \
+    -d '{"prompt": "San Francisco is a"}'
diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index 0e1acc9777d4b..fb1086db77823 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -21,6 +21,11 @@ steps:
       queue: amd
     command: bash .buildkite/run-amd-test.sh
 
+  - label: "Neuron Test"
+    agents:
+      queue: neuron
+    command: bash .buildkite/run-neuron-test.sh
+
   - label: "CPU Test"
     command: bash .buildkite/run-cpu-test.sh
 
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
new file mode 100644
index 0000000000000..fe42b4ef393f1
--- /dev/null
+++ b/Dockerfile.neuron
@@ -0,0 +1,36 @@
+# default base image
+ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
+
+FROM $BASE_IMAGE
+
+RUN echo "Base image is $BASE_IMAGE"
+
+# Install some basic utilities
+RUN apt-get update && apt-get install python3 python3-pip -y
+
+### Mount Point ###
+# When launching the container, mount the code directory to /app
+ARG APP_MOUNT=/app
+VOLUME [ ${APP_MOUNT} ]
+WORKDIR ${APP_MOUNT}
+
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
+RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+
+COPY ./vllm /app/vllm/vllm
+COPY ./setup.py /app/vllm/setup.py
+COPY ./requirements-common.txt /app/vllm/requirements-common.txt
+COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
+
+RUN cd /app/vllm \
+    && python3 -m pip install -U -r requirements-neuron.txt
+
+ENV VLLM_BUILD_WITH_NEURON 1
+RUN cd /app/vllm \
+    && pip install -e . \
+    && cd ..
+
+CMD ["/bin/bash"]
diff --git a/setup.py b/setup.py
index 008272e77906c..c755bf8f94067 100644
--- a/setup.py
+++ b/setup.py
@@ -208,7 +208,8 @@ def _is_neuron() -> bool:
         subprocess.run(["neuron-ls"], capture_output=True, check=True)
     except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
         torch_neuronx_installed = False
-    return torch_neuronx_installed
+    return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON",
+                                                     False)
 
 
 def _is_cpu() -> bool:
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index c3020d2b38db0..c436ece83f65a 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -335,8 +335,8 @@ def from_engine_args(
         engine_config = engine_args.create_engine_config()
 
         if engine_config.device_config.device_type == "neuron":
-            raise NotImplementedError("Neuron is not supported for "
-                                      "async engine yet.")
+            from vllm.executor.neuron_executor import NeuronExecutorAsync
+            executor_class = NeuronExecutorAsync
         elif engine_config.parallel_config.worker_use_ray:
             initialize_ray_cluster(engine_config.parallel_config)
             from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index 7cc187e297c9f..5a137d1bdcb3b 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -1,9 +1,10 @@
 from typing import Dict, List, Set, Tuple
 
-from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.utils import make_async
 
 logger = init_logger(__name__)
 
@@ -73,3 +74,22 @@ def check_health(self) -> None:
         # NeuronExecutor will always be healthy as long as
         # it's running.
         return
+
+
+class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
+
+    async def execute_model_async(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> SamplerOutput:
+        output = await make_async(self.driver_worker.execute_model)(
+            seq_group_metadata_list=seq_group_metadata_list, )
+        return output
+
+    async def check_health_async(self) -> None:
+        # NeuronExecutor will always be healthy as long as
+        # it's running.
+        return

From 6a44497c5b80eb9a3bdead0afb885f1f90387dc2 Mon Sep 17 00:00:00 2001
From: Adam Tilghman <agt@ucsd.edu>
Date: Thu, 18 Apr 2024 15:32:47 -0700
Subject: [PATCH 043/106] [Bugfix] Fix CustomAllreduce nvlink topology
 detection (#3974)

[Bugfix] Fix CustomAllreduce pcie nvlink topology detection (#3974) (#4159)
---
 vllm/distributed/device_communicators/custom_all_reduce.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index f83caef879da3..7602897d3dd8f 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -145,8 +145,10 @@ def _is_full_nvlink(rank, world_size):
     for i in range(world_size):
         if i != rank:
             try:
-                link_state = pynvml.nvmlDeviceGetNvLinkState(handle, i)
-                if not link_state:
+                peer_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+                p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+                    handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
+                if p2p_status != pynvml.NVML_P2P_STATUS_OK:
                     return False
             except pynvml.NVMLError as error:
                 logger.info(

From 692816304d43a44b4243f96e88af7fe6c9704c37 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 18 Apr 2024 16:15:12 -0700
Subject: [PATCH 044/106] [Core] add an option to log every function call to
 for debugging hang/crash in distributed inference (#4079)

Co-authored-by: Simon Mo <simon.mo@hey.com>
---
 .buildkite/test-pipeline.yaml             |  2 +-
 .github/ISSUE_TEMPLATE/400-bug report.yml |  2 +
 tests/test_logger.py                      | 27 ++++++++++++
 vllm/executor/ray_gpu_executor.py         | 12 ++++--
 vllm/logger.py                            | 52 +++++++++++++++++++++++
 vllm/utils.py                             | 13 +++++-
 vllm/worker/worker_base.py                | 20 +++++++--
 7 files changed, 120 insertions(+), 8 deletions(-)
 create mode 100644 tests/test_logger.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 2263dee20fbed..0f920c7ec1442 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -40,7 +40,7 @@ steps:
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
 
 - label: Engine Test
-  command: pytest -v -s engine tokenization test_sequence.py test_config.py
+  command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
 
 - label: Entrypoints Test
   commands:
diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml
index f1124dfa78bbc..c87f8fddcb776 100644
--- a/.github/ISSUE_TEMPLATE/400-bug report.yml	
+++ b/.github/ISSUE_TEMPLATE/400-bug report.yml	
@@ -57,6 +57,8 @@ body:
       If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
 
       Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
+
+      If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1` . All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs.
     placeholder: |
       A clear and concise description of what the bug is.
 
diff --git a/tests/test_logger.py b/tests/test_logger.py
new file mode 100644
index 0000000000000..601f72b50811c
--- /dev/null
+++ b/tests/test_logger.py
@@ -0,0 +1,27 @@
+import os
+import sys
+import tempfile
+
+from vllm.logger import enable_trace_function_call
+
+
+def f1(x):
+    return f2(x)
+
+
+def f2(x):
+    return x
+
+
+def test_trace_function_call():
+    fd, path = tempfile.mkstemp()
+    cur_dir = os.path.dirname(__file__)
+    enable_trace_function_call(path, cur_dir)
+    f1(1)
+    with open(path, 'r') as f:
+        content = f.read()
+
+    assert "f1" in content
+    assert "f2" in content
+    sys.settrace(None)
+    os.remove(path)
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 5a43f1fc28a84..f779b0f8a5113 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -10,7 +10,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
-                        make_async)
+                        get_vllm_instance_id, make_async)
 
 if ray is not None:
     from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -133,12 +133,18 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
         for node_id, gpu_ids in node_gpus.items():
             node_gpus[node_id] = sorted(gpu_ids)
 
-        # Set CUDA_VISIBLE_DEVICES for the driver and workers.
+        VLLM_INSTANCE_ID = get_vllm_instance_id()
+
+        # Set environment variables for the driver and workers.
         all_args_to_update_environment_variables = []
         for (node_id, _) in worker_node_and_gpu_ids:
             all_args_to_update_environment_variables.append([{
                 "CUDA_VISIBLE_DEVICES":
-                ",".join(map(str, node_gpus[node_id]))
+                ",".join(map(str, node_gpus[node_id])),
+                "VLLM_INSTANCE_ID":
+                VLLM_INSTANCE_ID,
+                "VLLM_TRACE_FUNCTION":
+                os.getenv("VLLM_TRACE_FUNCTION", "0"),
             }])
         self._run_workers("update_environment_variables",
                           all_args=all_args_to_update_environment_variables)
diff --git a/vllm/logger.py b/vllm/logger.py
index af9575085ef37..046f0e9099a4b 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -1,9 +1,11 @@
 # Adapted from
 # https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
 """Logging configuration for vLLM."""
+import datetime
 import logging
 import os
 import sys
+from functools import partial
 from typing import Optional
 
 VLLM_CONFIGURE_LOGGING = int(os.getenv("VLLM_CONFIGURE_LOGGING", "1"))
@@ -65,3 +67,53 @@ def init_logger(name: str):
         logger.addHandler(_default_handler)
         logger.propagate = False
     return logger
+
+
+logger = init_logger(__name__)
+
+
+def _trace_calls(log_path, root_dir, frame, event, arg=None):
+    if event in ['call', 'return']:
+        # Extract the filename, line number, function name, and the code object
+        filename = frame.f_code.co_filename
+        lineno = frame.f_lineno
+        func_name = frame.f_code.co_name
+        if not filename.startswith(root_dir):
+            # only log the functions in the vllm root_dir
+            return
+        # Log every function call or return
+        try:
+            with open(log_path, 'a') as f:
+                if event == 'call':
+                    f.write(f"{datetime.datetime.now()} Call to"
+                            f" {func_name} in {filename}:{lineno}\n")
+                else:
+                    f.write(f"{datetime.datetime.now()} Return from"
+                            f" {func_name} in {filename}:{lineno}\n")
+        except NameError:
+            # modules are deleted during shutdown
+            pass
+    return partial(_trace_calls, log_path, root_dir)
+
+
+def enable_trace_function_call(log_file_path: str,
+                               root_dir: Optional[str] = None):
+    """
+    Enable tracing of every function call in code under `root_dir`.
+    This is useful for debugging hangs or crashes.
+    `log_file_path` is the path to the log file.
+    `root_dir` is the root directory of the code to trace. If None, it is the
+    vllm root directory.
+
+    Note that this call is thread-level, any threads calling this function
+    will have the trace enabled. Other threads will not be affected.
+    """
+    logger.warning(
+        "VLLM_TRACE_FUNCTION is enabled. It will record every"
+        " function executed by Python. This will slow down the code. It "
+        "is suggested to be used for debugging hang or crashes only.")
+    logger.info(f"Trace frame log is saved to {log_file_path}")
+    if root_dir is None:
+        # by default, this is the vllm root directory
+        root_dir = os.path.dirname(os.path.dirname(__file__))
+    sys.settrace(partial(_trace_calls, log_file_path, root_dir))
diff --git a/vllm/utils.py b/vllm/utils.py
index 055f33e63e628..3054f3d27d1d6 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -163,6 +163,17 @@ def random_uuid() -> str:
     return str(uuid.uuid4().hex)
 
 
+@lru_cache(maxsize=None)
+def get_vllm_instance_id():
+    """
+    If the environment variable VLLM_INSTANCE_ID is set, return it.
+    Otherwise, return a random UUID.
+    Instance id represents an instance of the VLLM. All processes in the same
+    instance should have the same instance id.
+    """
+    return os.environ.get("VLLM_INSTANCE_ID", f"vllm-instance-{random_uuid()}")
+
+
 @lru_cache(maxsize=None)
 def in_wsl() -> bool:
     # Reference: https://github.com/microsoft/WSL/issues/4071
@@ -274,7 +285,7 @@ def get_open_port() -> int:
 
 def update_environment_variables(envs: Dict[str, str]):
     for k, v in envs.items():
-        if k in os.environ:
+        if k in os.environ and os.environ[k] != v:
             logger.warning(f"Overwriting environment variable {k} "
                            f"from '{os.environ[k]}' to '{v}'")
         os.environ[k] = v
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 13e062fe64b29..783dff3a43404 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -1,12 +1,15 @@
+import datetime
 import importlib
 import os
+import tempfile
+import threading
 from abc import ABC, abstractmethod
 from typing import Dict, List, Set, Tuple
 
-from vllm.logger import init_logger
+from vllm.logger import enable_trace_function_call, init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
-from vllm.utils import update_environment_variables
+from vllm.utils import get_vllm_instance_id, update_environment_variables
 
 logger = init_logger(__name__)
 
@@ -115,9 +118,20 @@ def update_environment_variables(self, envs: Dict[str, str]) -> None:
 
     def init_worker(self, *args, **kwargs):
         """
-        Actual initialization of the worker class.
+        Actual initialization of the worker class, and set up
+       function tracing if required.
         Arguments are passed to the worker class constructor.
         """
+        if int(os.getenv("VLLM_TRACE_FUNCTION", "0")):
+            tmp_dir = tempfile.gettempdir()
+            filename = (f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}"
+                        f"_thread_{threading.get_ident()}_"
+                        f"at_{datetime.datetime.now()}.log").replace(" ", "_")
+            log_path = os.path.join(tmp_dir, "vllm", get_vllm_instance_id(),
+                                    filename)
+            os.makedirs(os.path.dirname(log_path), exist_ok=True)
+            enable_trace_function_call(log_path)
+
         mod = importlib.import_module(self.worker_module_name)
         worker_class = getattr(mod, self.worker_class_name)
         self.worker = worker_class(*args, **kwargs)

From e726a89b62a40476bdf23b707e71abb84a21a84f Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Thu, 18 Apr 2024 21:13:36 -0700
Subject: [PATCH 045/106] Support eos_token_id from generation_config.json
 (#4182)

---
 vllm/engine/llm_engine.py | 19 +++++++++++++++++--
 vllm/sampling_params.py   | 14 +++++++++++++-
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index ca788d37c8f40..a7537c8761481 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1,7 +1,7 @@
 import time
 from typing import Iterable, List, Optional, Type, Union
 
-from transformers import PreTrainedTokenizer
+from transformers import GenerationConfig, PreTrainedTokenizer
 
 import vllm
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
@@ -34,6 +34,17 @@
 _LOCAL_LOGGING_INTERVAL_SEC = 5
 
 
+def _load_generation_config_dict(model_config: ModelConfig):
+    try:
+        return GenerationConfig.from_pretrained(
+            model_config.model,
+            revision=model_config.revision,
+        ).to_diff_dict()
+    except OSError:
+        # Not found.
+        return {}
+
+
 class LLMEngine:
     """An LLM engine that receives requests and generates texts.
 
@@ -126,6 +137,8 @@ def __init__(
         self._init_tokenizer()
         self.detokenizer = Detokenizer(self.tokenizer)
         self.seq_counter = Counter()
+        self.generation_config_fields = _load_generation_config_dict(
+            model_config)
 
         self.model_executor = executor_class(
             model_config=model_config,
@@ -393,6 +406,8 @@ def add_request(
         # inject the eos token id into the sampling_params to support min_tokens
         # processing
         sampling_params.eos_token_id = seq.eos_token_id
+        sampling_params.update_from_generation_config(
+            self.generation_config_fields)
 
         # Create the sequence group.
         seq_group = SequenceGroup(request_id, [seq], sampling_params,
@@ -437,7 +452,7 @@ def _process_model_outputs(
             scheduled_seq_groups: List[SequenceGroup],
             ignored_seq_groups: List[SequenceGroup]) -> List[RequestOutput]:
         """Apply the model output to the sequences in the scheduled seq groups.
-        
+
         Returns RequestOutputs that can be returned to the client.
         """
 
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 53a38b25bfdac..dc0e60344d858 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -2,7 +2,7 @@
 import copy
 from enum import IntEnum
 from functools import cached_property
-from typing import Callable, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch
 from pydantic import Field
@@ -271,6 +271,18 @@ def _verify_greedy_sampling(self) -> None:
             raise ValueError("best_of must be 1 when using greedy sampling."
                              f"Got {self.best_of}.")
 
+    def update_from_generation_config(
+            self, generation_config: Dict[str, Any]) -> None:
+        """Update if there are non-default values from generation_config"""
+        # Update eos_token_id for generation
+        if eos_ids := generation_config.get("eos_token_id"):
+            # it can be either int or list of int
+            if isinstance(eos_ids, int):
+                eos_ids = [eos_ids]
+            original_stop_token_ids = set(self.stop_token_ids)
+            original_stop_token_ids.update(eos_ids)
+            self.stop_token_ids = list(original_stop_token_ids)
+
     @cached_property
     def sampling_type(self) -> SamplingType:
         if self.use_beam_search:

From 643d8d1eac3dc08623197cf312994c6541536fa7 Mon Sep 17 00:00:00 2001
From: Jee Li <pandaleefree@163.com>
Date: Fri, 19 Apr 2024 15:59:54 +0800
Subject: [PATCH 046/106] [Bugfix] Fix LoRA loading check (#4138)

Co-authored-by: simon-mo <simon.mo@hey.com>
---
 tests/lora/conftest.py              |  6 ++++++
 tests/lora/test_lora_checkpoints.py | 22 ++++++++++++++++++++--
 vllm/lora/models.py                 |  4 +++-
 3 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 2dabfb6b4337c..a3ffc53d8cd1d 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -143,6 +143,12 @@ def baichuan_lora_files():
     return snapshot_download(repo_id="jeeejeee/baichuan7b-text2sql-spider")
 
 
+@pytest.fixture(scope="session")
+def baichuan_zero_lora_files():
+    # all the lora_B weights are initialized to zero.
+    return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init")
+
+
 @pytest.fixture(scope="session")
 def tinyllama_lora_files():
     return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py
index 35ad7342944cd..d4d1665b624ea 100644
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -3,9 +3,16 @@
 from vllm.lora.models import LoRAModel
 from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
 
+lora_lst = ["baichuan7B", "baichuan7B-zero", "chatglm3-6b"]
 
-@pytest.mark.parametrize("lora_name", ["baichuan7B", "chatglm3-6b"])
-def test_load_checkpoints(lora_name, chatglm3_lora_files, baichuan_lora_files):
+
+@pytest.mark.parametrize("lora_name", lora_lst)
+def test_load_checkpoints(
+    lora_name,
+    baichuan_lora_files,
+    baichuan_zero_lora_files,
+    chatglm3_lora_files,
+):
     supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
     packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
     embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
@@ -26,6 +33,17 @@ def test_load_checkpoints(lora_name, chatglm3_lora_files, baichuan_lora_files):
             device="cpu",
             embedding_modules=embedding_modules,
             embedding_padding_modules=embed_padding_modules)
+    elif lora_name == "baichuan7B-zero":
+        #Test that the target_modules contain prefix
+        # such as "model.layers.0.self_atten.W_pack", and
+        # the test should pass.
+        LoRAModel.from_local_checkpoint(
+            baichuan_zero_lora_files,
+            expected_lora_modules,
+            lora_model_id=1,
+            device="cpu",
+            embedding_modules=embedding_modules,
+            embedding_padding_modules=embed_padding_modules)
     else:
         # For the baichuan7B model, load chatglm3-6b's LoRA,
         # and the test should raise the following error.
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 62f1502458008..6bb9fee27d535 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -212,7 +212,9 @@ def from_local_checkpoint(
         target_modules = config["target_modules"]
         unexpected_modules = []
         for module in target_modules:
-            if module not in expected_lora_modules:
+            # Compatible with more modules, such as:layers.11.self_attn.k_proj
+            part_name = module.split(".")[-1]
+            if part_name not in expected_lora_modules:
                 unexpected_modules.append(module)
         # loaded lora's target modules must be a subset of expected_lora_modules
         if unexpected_modules:

From 39887d55cf093717857636354e24c8e23e699782 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 19 Apr 2024 01:00:22 -0700
Subject: [PATCH 047/106] Bump version of 0.4.1 (#4177)


From d857aa09de24f4891e40f3dcbcefb3e3fe86f842 Mon Sep 17 00:00:00 2001
From: Uranus <109661872+UranusSeven@users.noreply.github.com>
Date: Fri, 19 Apr 2024 16:18:33 +0800
Subject: [PATCH 048/106] [Misc] fix docstrings (#4191)

Co-authored-by: Zhong Wang <wangzhong@infini-ai.com>
---
 vllm/sequence.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index 92362a9a5d2a3..7dcacab6f2ab6 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -160,7 +160,7 @@ def reset_state_for_recompute(self) -> None:
         self._stage = SequenceStage.PREFILL
 
     def get_num_uncomputed_tokens(self) -> int:
-        """Return the number of prefil tokens that are not computed."""
+        """Return the number of prefill tokens that are not computed."""
         # we use `get_len()` which includes prompt_len + output_len instead
         # of prompt_len here. This is because during recompute we need to
         # prefill for both prompt and output.
@@ -345,12 +345,9 @@ def fork(self, new_seq_id: int) -> "Sequence":
     def get_num_new_tokens(self) -> int:
         """Get the number of new tokens to be computed.
 
-        Args:
-            remainig_token_budget: The remaining token budgets.
         Returns:
-            The new number of tokens to be computed. I.e., 1 for decode, prompt
-            size for prefill. If there's not enough remainig_token_budget, it
-            can return the chunked number of new tokens.
+            The new number of tokens to be computed. I.e., 1 for decode, or
+            the remaining prompt size for prefill.
         """
         if self.data.stage == SequenceStage.DECODE:
             return 1

From 93b20dbf63f8a1da8c7af95bd3a24510837a1d54 Mon Sep 17 00:00:00 2001
From: Ronen Schaffer <ronen.schaffer@ibm.com>
Date: Fri, 19 Apr 2024 18:08:26 +0300
Subject: [PATCH 049/106] [Bugfix][Core] Restore logging of stats in the async
 engine (#4150)

---
 vllm/engine/async_llm_engine.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index c436ece83f65a..ca4ba66f09cb8 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -217,10 +217,16 @@ async def step_async(self) -> List[RequestOutput]:
         else:
             output = []
 
-        return self._process_model_outputs(
+        request_outputs = self._process_model_outputs(
             output, scheduler_outputs.scheduled_seq_groups,
             scheduler_outputs.ignored_seq_groups)
 
+        # Log stats.
+        if self.log_stats:
+            self.stat_logger.log(self._get_stats(scheduler_outputs))
+
+        return request_outputs
+
     async def encode_request_async(
         self,
         request_id: str,  # pylint: disable=unused-argument

From 9d0b98046a8c501c7abec97bc288ca77bda3064e Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 19 Apr 2024 12:44:51 -0700
Subject: [PATCH 050/106] [Misc] add nccl in collect env (#4211)

---
 .github/ISSUE_TEMPLATE/200-installation.yml           | 1 +
 .github/ISSUE_TEMPLATE/300-usage.yml                  | 1 +
 .github/ISSUE_TEMPLATE/400-bug report.yml             | 1 +
 .github/ISSUE_TEMPLATE/700-performance discussion.yml | 1 +
 collect_env.py                                        | 2 ++
 5 files changed, 6 insertions(+)

diff --git a/.github/ISSUE_TEMPLATE/200-installation.yml b/.github/ISSUE_TEMPLATE/200-installation.yml
index 4c6c96187cc6c..df41ade8c3c01 100644
--- a/.github/ISSUE_TEMPLATE/200-installation.yml
+++ b/.github/ISSUE_TEMPLATE/200-installation.yml
@@ -18,6 +18,7 @@ body:
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```
+      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
     value: |
       ```text
       The output of `python collect_env.py`
diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml
index 88227b4b2e7b9..54763af1058f6 100644
--- a/.github/ISSUE_TEMPLATE/300-usage.yml
+++ b/.github/ISSUE_TEMPLATE/300-usage.yml
@@ -18,6 +18,7 @@ body:
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```
+      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
     value: |
       ```text
       The output of `python collect_env.py`
diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml
index c87f8fddcb776..08120ad8e5a60 100644
--- a/.github/ISSUE_TEMPLATE/400-bug report.yml	
+++ b/.github/ISSUE_TEMPLATE/400-bug report.yml	
@@ -18,6 +18,7 @@ body:
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```
+      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
     value: |
       ```text
       The output of `python collect_env.py`
diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance discussion.yml
index 9e8e7b4aa3530..4f8843420a94e 100644
--- a/.github/ISSUE_TEMPLATE/700-performance discussion.yml	
+++ b/.github/ISSUE_TEMPLATE/700-performance discussion.yml	
@@ -39,6 +39,7 @@ body:
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```
+      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
     value: |
       ```text
       The output of `python collect_env.py`
diff --git a/collect_env.py b/collect_env.py
index e09abfed3dfba..c89f8c64eddcb 100644
--- a/collect_env.py
+++ b/collect_env.py
@@ -63,6 +63,7 @@
     "magma",
     "triton",
     "optree",
+    "nccl",
 }
 
 DEFAULT_PIP_PATTERNS = {
@@ -73,6 +74,7 @@
     "triton",
     "optree",
     "onnx",
+    "nccl",
     # UPSTREAM SYNC: needed for sparsity
     "nm-magic-wand-nightly",
 }

From a1fe28b364a153a0bcec77a73e8eb588a2a20eef Mon Sep 17 00:00:00 2001
From: Chirag Jain <jain.chirag925@gmail.com>
Date: Sat, 20 Apr 2024 05:43:56 +0530
Subject: [PATCH 051/106] Pass `tokenizer_revision` when getting tokenizer in
 openai serving (#4214)

---
 vllm/entrypoints/openai/serving_engine.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index a95a39ce3008b..81a15417c17ee 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -68,6 +68,7 @@ async def _post_init(self):
         self.tokenizer = get_tokenizer(
             engine_model_config.tokenizer,
             tokenizer_mode=engine_model_config.tokenizer_mode,
+            tokenizer_revision=engine_model_config.tokenizer_revision,
             trust_remote_code=engine_model_config.trust_remote_code,
             truncation_side="left")
 

From 58b01e511802b26fe3e2f6760af66fd9e7f307b8 Mon Sep 17 00:00:00 2001
From: Ayush Rautwar <42046470+ayusher@users.noreply.github.com>
Date: Fri, 19 Apr 2024 23:49:22 -0400
Subject: [PATCH 052/106] [Bugfix] Add fix for JSON whitespace (#4189)

Co-authored-by: Ubuntu <ubuntu@ip-172-31-13-147.ec2.internal>
---
 tests/entrypoints/test_openai_server.py       | 27 ++++++++++---------
 .../outlines_logits_processors.py             |  5 ++++
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index 2e1f1f1a707a3..5c416a12555b5 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -754,19 +754,20 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
 
 
 async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
-    resp = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=[{
-            "role":
-            "user",
-            "content": ('what is 1+1? please respond with a JSON object, '
-                        'the format is {"result": 2}')
-        }],
-        response_format={"type": "json_object"})
-
-    content = resp.choices[0].message.content
-    loaded = json.loads(content)
-    assert loaded == {"result": 2}, loaded
+    for _ in range(2):
+        resp = await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[{
+                "role":
+                "user",
+                "content": ('what is 1+1? please respond with a JSON object, '
+                            'the format is {"result": 2}')
+            }],
+            response_format={"type": "json_object"})
+
+        content = resp.choices[0].message.content
+        loaded = json.loads(content)
+        assert loaded == {"result": 2}, loaded
 
 
 async def test_guided_grammar(server, client: openai.AsyncOpenAI):
diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index 95a67b612f08b..25ab5bf8b6a9c 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -131,6 +131,11 @@ def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
         fsm = CFGFSM(cfg, tokenizer)
         self.fsm = fsm
 
+    def init_state(self):
+        """Initialize state with a CFGFSM copy."""
+        super().init_state()
+        self.fsm = self.fsm.copy()
+
 
 @lru_cache
 def _adapt_tokenizer(tokenizer: PreTrainedTokenizerBase):

From dc1d3f7ca98ad393a23531956feb9a5df9de41ce Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sat, 20 Apr 2024 04:51:33 +0100
Subject: [PATCH 053/106] Fix missing docs and out of sync `EngineArgs` (#4219)

Co-authored-by: Harry Mellor <hmellor@oxts.com>
---
 docs/source/conf.py                |   2 +
 docs/source/models/engine_args.rst | 134 ++----------------------
 vllm/engine/arg_utils.py           | 159 ++++++++++++++++-------------
 3 files changed, 98 insertions(+), 197 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 19cc8557a7541..cfa956b143ba3 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -11,12 +11,14 @@
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 
 import logging
+import os
 import sys
 from typing import List
 
 from sphinx.ext import autodoc
 
 logger = logging.getLogger(__name__)
+sys.path.append(os.path.abspath("../.."))
 
 # -- Project information -----------------------------------------------------
 
diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst
index 235cb4e128c99..92bc7e0e843ed 100644
--- a/docs/source/models/engine_args.rst
+++ b/docs/source/models/engine_args.rst
@@ -5,133 +5,17 @@ Engine Arguments
 
 Below, you can find an explanation of every engine argument for vLLM:
 
-.. option:: --model <model_name_or_path>
-
-    Name or path of the huggingface model to use.
-
-.. option:: --tokenizer <tokenizer_name_or_path>
-
-    Name or path of the huggingface tokenizer to use.
-
-.. option:: --revision <revision>
-
-    The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
-
-.. option:: --tokenizer-revision <revision>
-
-    The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
-
-.. option:: --tokenizer-mode {auto,slow}
-
-    The tokenizer mode.
-    
-    * "auto" will use the fast tokenizer if available.
-    * "slow" will always use the slow tokenizer.
-
-.. option:: --trust-remote-code
-
-    Trust remote code from huggingface.
-
-.. option:: --download-dir <directory>
-
-    Directory to download and load the weights, default to the default cache dir of huggingface.
-
-.. option:: --load-format {auto,pt,safetensors,npcache,dummy,tensorizer}
-
-    The format of the model weights to load.
-
-    * "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available.
-    * "pt" will load the weights in the pytorch bin format.
-    * "safetensors" will load the weights in the safetensors format.
-    * "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading.
-    * "dummy" will initialize the weights with random values, mainly for profiling.
-    * "tensorizer" will load serialized weights using `CoreWeave's Tensorizer model deserializer. <https://github.com/coreweave/tensorizer>`_ See `examples/tensorize_vllm_model.py <https://github.com/vllm-project/vllm/blob/main/examples/tensorize_vllm_model.py>`_ to serialize a vLLM model, and for more information.
-
-.. option:: --dtype {auto,half,float16,bfloat16,float,float32}
-
-    Data type for model weights and activations.
-
-    * "auto" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.
-    * "half" for FP16. Recommended for AWQ quantization.
-    * "float16" is the same as "half".
-    * "bfloat16" for a balance between precision and range.
-    * "float" is shorthand for FP32 precision.
-    * "float32" for FP32 precision.
-
-.. option:: --max-model-len <length>
-
-    Model context length. If unspecified, will be automatically derived from the model config.
-
-.. option:: --worker-use-ray
-
-    Use Ray for distributed serving, will be automatically set when using more than 1 GPU.
-
-.. option:: --pipeline-parallel-size (-pp) <size>
-
-    Number of pipeline stages.
-
-.. option:: --tensor-parallel-size (-tp) <size>
-
-    Number of tensor parallel replicas.
-
-.. option:: --max-parallel-loading-workers <workers>
-
-    Load model sequentially in multiple batches, to avoid RAM OOM when using tensor parallel and large models.
-
-.. option:: --block-size {8,16,32}
-
-    Token block size for contiguous chunks of tokens.
-
-.. option:: --enable-prefix-caching
-
-    Enables automatic prefix caching
-
-.. option:: --seed <seed>
-
-    Random seed for operations.
-
-.. option:: --swap-space <size>
-
-    CPU swap space size (GiB) per GPU.
-
-.. option:: --gpu-memory-utilization <fraction>
-
-    The fraction of GPU memory to be used for the model executor, which can range from 0 to 1. 
-    For example, a value of 0.5 would imply 50% GPU memory utilization.
-    If unspecified, will use the default value of 0.9.
-
-.. option:: --max-num-batched-tokens <tokens>
-
-    Maximum number of batched tokens per iteration.
-
-.. option:: --max-num-seqs <sequences>
-
-    Maximum number of sequences per iteration.
-
-.. option:: --max-paddings <paddings>
-
-    Maximum number of paddings in a batch.
-
-.. option:: --disable-log-stats
-
-    Disable logging statistics.
-
-.. option:: --quantization (-q) {awq,squeezellm,None}
-
-    Method used to quantize the weights.
+.. argparse::
+    :module: vllm.engine.arg_utils
+    :func: _engine_args_parser
+    :prog: -m vllm.entrypoints.openai.api_server
 
 Async Engine Arguments
 ----------------------
-Below are the additional arguments related to the asynchronous engine:
-
-.. option:: --engine-use-ray
 
-    Use Ray to start the LLM engine in a separate process as the server process.
-
-.. option:: --disable-log-requests
-
-    Disable logging requests.
-    
-.. option:: --max-log-len
+Below are the additional arguments related to the asynchronous engine:
 
-    Max number of prompt characters or prompt ID numbers being printed in log. Defaults to unlimited.
\ No newline at end of file
+.. argparse::
+    :module: vllm.engine.arg_utils
+    :func: _async_engine_args_parser
+    :prog: -m vllm.entrypoints.openai.api_server
\ No newline at end of file
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index a2dfe0cbdc86e..0d00a695679a1 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -86,57 +86,55 @@ def add_cli_args(
             parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         """Shared CLI arguments for vLLM engine."""
 
-        # NOTE: If you update any of the arguments below, please also
-        # make sure to update docs/source/models/engine_args.rst
-
         # Model arguments
         parser.add_argument(
             '--model',
             type=str,
             default='facebook/opt-125m',
-            help='name or path of the huggingface model to use')
+            help='Name or path of the huggingface model to use.')
         parser.add_argument(
             '--tokenizer',
             type=str,
             default=EngineArgs.tokenizer,
-            help='name or path of the huggingface tokenizer to use')
+            help='Name or path of the huggingface tokenizer to use.')
         parser.add_argument(
             '--revision',
             type=str,
             default=None,
-            help='the specific model version to use. It can be a branch '
+            help='The specific model version to use. It can be a branch '
             'name, a tag name, or a commit id. If unspecified, will use '
             'the default version.')
         parser.add_argument(
             '--code-revision',
             type=str,
             default=None,
-            help='the specific revision to use for the model code on '
+            help='The specific revision to use for the model code on '
             'Hugging Face Hub. It can be a branch name, a tag name, or a '
             'commit id. If unspecified, will use the default version.')
         parser.add_argument(
             '--tokenizer-revision',
             type=str,
             default=None,
-            help='the specific tokenizer version to use. It can be a branch '
+            help='The specific tokenizer version to use. It can be a branch '
             'name, a tag name, or a commit id. If unspecified, will use '
             'the default version.')
-        parser.add_argument('--tokenizer-mode',
-                            type=str,
-                            default=EngineArgs.tokenizer_mode,
-                            choices=['auto', 'slow'],
-                            help='tokenizer mode. "auto" will use the fast '
-                            'tokenizer if available, and "slow" will '
-                            'always use the slow tokenizer.')
+        parser.add_argument(
+            '--tokenizer-mode',
+            type=str,
+            default=EngineArgs.tokenizer_mode,
+            choices=['auto', 'slow'],
+            help='The tokenizer mode.\n\n* "auto" will use the '
+            'fast tokenizer if available.\n* "slow" will '
+            'always use the slow tokenizer.')
         parser.add_argument('--trust-remote-code',
                             action='store_true',
-                            help='trust remote code from huggingface')
+                            help='Trust remote code from huggingface.')
         parser.add_argument('--download-dir',
                             type=str,
                             default=EngineArgs.download_dir,
-                            help='directory to download and load the weights, '
+                            help='Directory to download and load the weights, '
                             'default to the default cache dir of '
-                            'huggingface')
+                            'huggingface.')
         parser.add_argument(
             '--load-format',
             type=str,
@@ -144,19 +142,19 @@ def add_cli_args(
             choices=[
                 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer'
             ],
-            help='The format of the model weights to load. '
-            '"auto" will try to load the weights in the safetensors format '
+            help='The format of the model weights to load.\n\n'
+            '* "auto" will try to load the weights in the safetensors format '
             'and fall back to the pytorch bin format if safetensors format '
-            'is not available. '
-            '"pt" will load the weights in the pytorch bin format. '
-            '"safetensors" will load the weights in the safetensors format. '
-            '"npcache" will load the weights in pytorch format and store '
-            'a numpy cache to speed up the loading. '
-            '"dummy" will initialize the weights with random values, '
-            'which is mainly for profiling.'
-            '"tensorizer" will load the weights using tensorizer from CoreWeave'
-            'which assumes tensorizer_uri is set to the location of the '
-            'serialized weights.')
+            'is not available.\n'
+            '* "pt" will load the weights in the pytorch bin format.\n'
+            '* "safetensors" will load the weights in the safetensors format.\n'
+            '* "npcache" will load the weights in pytorch format and store '
+            'a numpy cache to speed up the loading.\n'
+            '* "dummy" will initialize the weights with random values, '
+            'which is mainly for profiling.\n'
+            '* "tensorizer" will load the weights using tensorizer from '
+            'CoreWeave which assumes tensorizer_uri is set to the location of '
+            'the serialized weights.')
         parser.add_argument(
             '--dtype',
             type=str,
@@ -164,10 +162,14 @@ def add_cli_args(
             choices=[
                 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
             ],
-            help='data type for model weights and activations. '
-            'The "auto" option will use FP16 precision '
-            'for FP32 and FP16 models, and BF16 precision '
-            'for BF16 models.')
+            help='Data type for model weights and activations.\n\n'
+            '* "auto" will use FP16 precision for FP32 and FP16 models, and '
+            'BF16 precision for BF16 models.\n'
+            '* "half" for FP16. Recommended for AWQ quantization.\n'
+            '* "float16" is the same as "half".\n'
+            '* "bfloat16" for a balance between precision and range.\n'
+            '* "float" is shorthand for FP32 precision.\n'
+            '* "float32" for FP32 precision.')
         parser.add_argument(
             '--kv-cache-dtype',
             type=str,
@@ -176,7 +178,7 @@ def add_cli_args(
             help='Data type for kv cache storage. If "auto", will use model '
             'data type. FP8_E5M2 (without scaling) is only supported on cuda '
             'version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
-            'supported for common inference criteria. ')
+            'supported for common inference criteria.')
         parser.add_argument(
             '--quantization-param-path',
             type=str,
@@ -187,58 +189,59 @@ def add_cli_args(
             'default to 1.0, which may cause accuracy issues. '
             'FP8_E5M2 (without scaling) is only supported on cuda version'
             'greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
-            'supported for common inference criteria. ')
+            'supported for common inference criteria.')
         parser.add_argument('--max-model-len',
                             type=int,
                             default=EngineArgs.max_model_len,
-                            help='model context length. If unspecified, '
-                            'will be automatically derived from the model.')
+                            help='Model context length. If unspecified, will '
+                            'be automatically derived from the model config.')
         parser.add_argument(
             '--guided-decoding-backend',
             type=str,
             default='outlines',
             choices=['outlines', 'lm-format-enforcer'],
             help='Which engine will be used for guided decoding'
-            ' (JSON schema / regex etc)')
+            ' (JSON schema / regex etc).')
         # Parallel arguments
         parser.add_argument('--worker-use-ray',
                             action='store_true',
-                            help='use Ray for distributed serving, will be '
-                            'automatically set when using more than 1 GPU')
+                            help='Use Ray for distributed serving, will be '
+                            'automatically set when using more than 1 GPU.')
         parser.add_argument('--pipeline-parallel-size',
                             '-pp',
                             type=int,
                             default=EngineArgs.pipeline_parallel_size,
-                            help='number of pipeline stages')
+                            help='Number of pipeline stages.')
         parser.add_argument('--tensor-parallel-size',
                             '-tp',
                             type=int,
                             default=EngineArgs.tensor_parallel_size,
-                            help='number of tensor parallel replicas')
+                            help='Number of tensor parallel replicas.')
         parser.add_argument(
             '--max-parallel-loading-workers',
             type=int,
             default=EngineArgs.max_parallel_loading_workers,
-            help='load model sequentially in multiple batches, '
+            help='Load model sequentially in multiple batches, '
             'to avoid RAM OOM when using tensor '
-            'parallel and large models')
+            'parallel and large models.')
         parser.add_argument(
             '--ray-workers-use-nsight',
             action='store_true',
-            help='If specified, use nsight to profile ray workers')
+            help='If specified, use nsight to profile Ray workers.')
         # KV cache arguments
         parser.add_argument('--block-size',
                             type=int,
                             default=EngineArgs.block_size,
                             choices=[8, 16, 32, 128],
-                            help='token block size')
+                            help='Token block size for contiguous chunks of '
+                            'tokens.')
 
         parser.add_argument('--enable-prefix-caching',
                             action='store_true',
-                            help='Enables automatic prefix caching')
+                            help='Enables automatic prefix caching.')
         parser.add_argument('--use-v2-block-manager',
                             action='store_true',
-                            help='Use BlockSpaceMangerV2')
+                            help='Use BlockSpaceMangerV2.')
         parser.add_argument(
             '--num-lookahead-slots',
             type=int,
@@ -251,18 +254,19 @@ def add_cli_args(
         parser.add_argument('--seed',
                             type=int,
                             default=EngineArgs.seed,
-                            help='random seed')
+                            help='Random seed for operations.')
         parser.add_argument('--swap-space',
                             type=int,
                             default=EngineArgs.swap_space,
-                            help='CPU swap space size (GiB) per GPU')
+                            help='CPU swap space size (GiB) per GPU.')
         parser.add_argument(
             '--gpu-memory-utilization',
             type=float,
             default=EngineArgs.gpu_memory_utilization,
-            help='the fraction of GPU memory to be used for '
-            'the model executor, which can range from 0 to 1.'
-            'If unspecified, will use the default value of 0.9.')
+            help='The fraction of GPU memory to be used for the model '
+            'executor, which can range from 0 to 1. For example, a value of '
+            '0.5 would imply 50%% GPU memory utilization. If unspecified, '
+            'will use the default value of 0.9.')
         parser.add_argument(
             '--num-gpu-blocks-override',
             type=int,
@@ -272,21 +276,21 @@ def add_cli_args(
         parser.add_argument('--max-num-batched-tokens',
                             type=int,
                             default=EngineArgs.max_num_batched_tokens,
-                            help='maximum number of batched tokens per '
-                            'iteration')
+                            help='Maximum number of batched tokens per '
+                            'iteration.')
         parser.add_argument('--max-num-seqs',
                             type=int,
                             default=EngineArgs.max_num_seqs,
-                            help='maximum number of sequences per iteration')
+                            help='Maximum number of sequences per iteration.')
         parser.add_argument(
             '--max-logprobs',
             type=int,
             default=EngineArgs.max_logprobs,
-            help=('max number of log probs to return logprobs is specified in'
-                  ' SamplingParams'))
+            help=('Max number of log probs to return logprobs is specified in'
+                  ' SamplingParams.'))
         parser.add_argument('--disable-log-stats',
                             action='store_true',
-                            help='disable logging statistics')
+                            help='Disable logging statistics.')
         # Quantization settings.
         parser.add_argument('--quantization',
                             '-q',
@@ -318,13 +322,13 @@ def add_cli_args(
         parser.add_argument('--max-context-len-to-capture',
                             type=int,
                             default=EngineArgs.max_context_len_to_capture,
-                            help='maximum context length covered by CUDA '
+                            help='Maximum context length covered by CUDA '
                             'graphs. When a sequence has context length '
                             'larger than this, we fall back to eager mode.')
         parser.add_argument('--disable-custom-all-reduce',
                             action='store_true',
                             default=EngineArgs.disable_custom_all_reduce,
-                            help='See ParallelConfig')
+                            help='See ParallelConfig.')
         parser.add_argument('--tokenizer-pool-size',
                             type=int,
                             default=EngineArgs.tokenizer_pool_size,
@@ -417,7 +421,7 @@ def add_cli_args(
             '--enable-chunked-prefill',
             action='store_true',
             help='If set, the prefill requests can be chunked based on the '
-            'max_num_batched_tokens')
+            'max_num_batched_tokens.')
 
         parser.add_argument(
             '--speculative-model',
@@ -431,7 +435,7 @@ def add_cli_args(
             type=int,
             default=None,
             help='The number of speculative tokens to sample from '
-            'the draft model in speculative decoding')
+            'the draft model in speculative decoding.')
 
         parser.add_argument('--model-loader-extra-config',
                             type=str,
@@ -550,20 +554,31 @@ class AsyncEngineArgs(EngineArgs):
     max_log_len: Optional[int] = None
 
     @staticmethod
-    def add_cli_args(
-            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
-        parser = EngineArgs.add_cli_args(parser)
+    def add_cli_args(parser: argparse.ArgumentParser,
+                     async_args_only: bool = False) -> argparse.ArgumentParser:
+        if not async_args_only:
+            parser = EngineArgs.add_cli_args(parser)
         parser.add_argument('--engine-use-ray',
                             action='store_true',
-                            help='use Ray to start the LLM engine in a '
+                            help='Use Ray to start the LLM engine in a '
                             'separate process as the server process.')
         parser.add_argument('--disable-log-requests',
                             action='store_true',
-                            help='disable logging requests')
+                            help='Disable logging requests.')
         parser.add_argument('--max-log-len',
                             type=int,
                             default=None,
-                            help='max number of prompt characters or prompt '
-                            'ID numbers being printed in log. '
-                            'Default: unlimited.')
+                            help='Max number of prompt characters or prompt '
+                            'ID numbers being printed in log.'
+                            '\n\nDefault: Unlimited')
         return parser
+
+
+# These functions are used by sphinx to build the documentation
+def _engine_args_parser():
+    return EngineArgs.add_cli_args(argparse.ArgumentParser())
+
+
+def _async_engine_args_parser():
+    return AsyncEngineArgs.add_cli_args(argparse.ArgumentParser(),
+                                        async_args_only=True)

From 38094349edcded4ee04605d79976ce41271d713e Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Fri, 19 Apr 2024 21:28:57 -0700
Subject: [PATCH 054/106] [Kernel][FP8] Initial support with dynamic per-tensor
 scaling (#4118)

Provide an initial support to FP8 computation. This PR is inspired by HuggingFace TGI: huggingface/text-generation-inference#1726

This feature can be enabled with --quantization fp8 or -q fp8 when launching an engine.

Algorithm:
We still load a model checkpoint in FP16/BF16. After the weights are loaded, Fp8LinearMethod calculates the per-tensor scaling factor of weights and quantizes the weights accordingly. The scaling factor will then be stored for future use. Meanwhile, the per-tensor scaling factor for activations is calculated in every forward pass.

Initial Results:
Currently tested Mistral-7B on 1xH100. With prompt length ~5 and decoding length 128:

BF16: 1.47s
FP8: 1.66s
I'll try to use larger models and try to find more performance bottleneck. Meanwhile, you're welcome to try this code.
---
 tests/quantization/test_fp8.py                |  24 +++
 vllm/entrypoints/llm.py                       |   9 +-
 vllm/model_executor/layers/linear.py          |   8 +
 .../layers/quantization/__init__.py           |   2 +
 .../model_executor/layers/quantization/fp8.py | 138 ++++++++++++++++++
 vllm/model_executor/model_loader/loader.py    |   4 +
 .../model_loader/weight_utils.py              |   9 +-
 7 files changed, 189 insertions(+), 5 deletions(-)
 create mode 100644 tests/quantization/test_fp8.py
 create mode 100644 vllm/model_executor/layers/quantization/fp8.py

diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
new file mode 100644
index 0000000000000..fa10e60de10a7
--- /dev/null
+++ b/tests/quantization/test_fp8.py
@@ -0,0 +1,24 @@
+"""Tests whether FP8 computation is enabled correctly.
+
+Run `pytest tests/quantization/test_fp8.py --forked`.
+"""
+import pytest
+import torch
+
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
+
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+
+
+@pytest.mark.skipif(
+    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
+    reason="FP8 is not supported on this GPU type.")
+def test_load_fp16_model(vllm_runner) -> None:
+    llm = vllm_runner("facebook/opt-125m", quantization="fp8")
+
+    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
+    fc1 = model.model.decoder.layers[0].fc1
+    assert isinstance(fc1.linear_method, Fp8LinearMethod)
+    assert fc1.weight.dtype == torch.float8_e4m3fn
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 5a8cd046df09c..ae028200a98de 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -42,10 +42,11 @@ class LLM:
             However, if the `torch_dtype` in the config is `float32`, we will
             use `float16` instead.
         quantization: The method used to quantize the model weights. Currently,
-            we support "awq", "gptq" and "squeezellm". If None, we first check
-            the `quantization_config` attribute in the model config file. If
-            that is None, we assume the model weights are not quantized and use
-            `dtype` to determine the data type of the weights.
+            we support "awq", "gptq", "squeezellm", and "fp8" (experimental).
+            If None, we first check the `quantization_config` attribute in the
+            model config file. If that is None, we assume the model weights are
+            not quantized and use `dtype` to determine the data type of
+            the weights.
         sparsity: The format of the sparse model weights. Currently,
             we support "sparse_w16a16". If None, we first check the `sparsity` 
             attribute in the model config file. If that is None, we assume the 
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index c1590a1cdabd5..a0136c6b31c92 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -3,6 +3,7 @@
 
 import torch
 import torch.nn.functional as F
+from torch import nn
 from torch.nn.parameter import Parameter
 
 from vllm.distributed import (divide, get_tensor_model_parallel_rank,
@@ -50,6 +51,13 @@ def apply_weights(self,
         Expects create_weights to have been called before on the layer."""
         raise NotImplementedError
 
+    def process_weights_after_loading(self, layer: nn.Module) -> None:
+        """Process the weight after loading.
+
+        This can be used for example, to transpose weights for computation.
+        """
+        return
+
 
 class UnquantizedLinearMethod(LinearMethodBase):
     """Linear method without quantization.
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index a3b89a66469eb..0344d6e4e3e45 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -3,12 +3,14 @@
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.model_executor.layers.quantization.fp8 import FP8Config
 from vllm.model_executor.layers.quantization.gptq import GPTQConfig
 from vllm.model_executor.layers.quantization.marlin import MarlinConfig
 from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
 
 QUANTIZATION_METHODS = {
     "awq": AWQConfig,
+    "fp8": FP8Config,
     "gptq": GPTQConfig,
     "squeezellm": SqueezeLLMConfig,
     "marlin": MarlinConfig,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
new file mode 100644
index 0000000000000..9dc0e86e1243d
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -0,0 +1,138 @@
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+
+from vllm.model_executor.layers.linear import (LinearMethodBase,
+                                               set_weight_attrs)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+
+
+class FP8Config(QuantizationConfig):
+    """Config class for FP8."""
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "fp8"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # TODO: PyTorch 2.3.0+ is required to run FP8 on
+        # SM 89 (e.g. Ada) GPUs. Specifically, this PR has to
+        # be included: https://github.com/pytorch/pytorch/pull/118881
+        return 90
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "FP8Config":
+        return cls()
+
+    def get_linear_method(self) -> "Fp8LinearMethod":
+        return Fp8LinearMethod(self)
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class Fp8LinearMethod(LinearMethodBase):
+    """Linear method for FP8.
+    We now support common FP16/BF16 model checkpoints ONLY. The weight
+    scaling factor will be initialized after the model weights are loaded.
+
+    Limitations:
+    1. Only support per-tensor quantization due to torch._scaled_mm support.
+    2. Only support float8_e4m3fn data type due to the limitation of
+       torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856)
+       
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: FP8Config):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_size_per_partition: int,
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        weight = Parameter(torch.empty(output_size_per_partition,
+                                       input_size_per_partition,
+                                       dtype=params_dtype),
+                           requires_grad=False)
+        layer.register_parameter("weight", weight)
+        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+        set_weight_attrs(weight, extra_weight_attrs)
+
+        w_scale = Parameter(
+            torch.empty(1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("weight_scaling_factor", w_scale)
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        # Although the linear_method is propagated to all layers,
+        # only linear layers invoke "create_weights". So we check
+        # whether "weight_scaling_facor" is registered to determine
+        # whether the layer is a linear layer that requires quantization.
+        if not hasattr(layer, "weight_scaling_factor"):
+            return
+
+        qweight, weight_scale = per_tensor_quantize(layer.weight)
+        # torch._scaled_mm requires column-major in the second
+        # input (weight), so we transpose the quantized weight.
+        layer.weight = Parameter(qweight.t(), requires_grad=False)
+        layer.weight_scaling_factor.data.copy_(weight_scale)
+
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        qinput, x_scale = per_tensor_quantize(x)
+        output, _ = torch._scaled_mm(
+            qinput,
+            layer.weight,
+            out_dtype=x.dtype,
+            scale_a=x_scale,
+            scale_b=layer.weight_scaling_factor,
+            bias=bias,
+        )
+        return output
+
+
+def per_tensor_quantize(tensor: torch.Tensor) -> tuple[torch.Tensor, float]:
+    """Quantize a tensor using per-tensor static scaling factor.
+
+    Args:
+        tensor: The input tensor.
+    """
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    # Calculate the scale as dtype max divided by absmax.
+    # Since .abs() creates a new tensor, we use aminmax to get
+    # the min and max first and then calculate the absmax.
+    min_val, max_val = tensor.aminmax()
+    amax = min_val.abs().max(max_val.abs())
+    scale = finfo.max / amax.clamp(min=1e-12)
+    # scale and clamp the tensor to bring it to
+    # the representative range of float8 data type
+    # (as default cast is unsaturated)
+    qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max)
+    # Return both float8 data and the inverse scale (as float),
+    # as both required as inputs to torch._scaled_mm
+    qweight = qweight.to(torch.float8_e4m3fn)
+    scale = scale.float().reciprocal()
+    return qweight, scale
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 3b1d125ef8a67..6c8cb2935f37e 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -228,6 +228,10 @@ def load_model(self, *, model_config: ModelConfig,
                                                model,
                                                "fall_back_to_pt_during_load",
                                                True)), )
+            for _, module in model.named_modules():
+                linear_method = getattr(module, "linear_method", None)
+                if linear_method is not None:
+                    linear_method.process_weights_after_loading(module)
         return model.eval()
 
 
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index de534a1d0e3c4..46f07205aceab 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -149,11 +149,18 @@ def get_quant_config(model_config: ModelConfig,
                                           tqdm_class=DisabledTqdm)
     else:
         hf_folder = model_name_or_path
+
+    possible_config_filenames = quant_cls.get_config_filenames()
+
+    # If the quantization config is not found, use the default config.
+    if not possible_config_filenames:
+        return quant_cls()
+
     config_files = glob.glob(os.path.join(hf_folder, "*.json"))
 
     quant_config_files = [
         f for f in config_files if any(
-            f.endswith(x) for x in quant_cls.get_config_filenames())
+            f.endswith(x) for x in possible_config_filenames)
     ]
     if len(quant_config_files) == 0:
         raise ValueError(

From 41f9c9bd1a1dcaef4025ea714d8d010e15251d1f Mon Sep 17 00:00:00 2001
From: nunjunj <106306814+nunjunj@users.noreply.github.com>
Date: Sat, 20 Apr 2024 00:11:57 -0700
Subject: [PATCH 055/106] [Frontend] multiple sampling params support  (#3570)

---
 tests/entrypoints/test_llm_generate.py | 41 ++++++++++++++++++++++++++
 vllm/entrypoints/llm.py                | 30 ++++++++++++-------
 2 files changed, 61 insertions(+), 10 deletions(-)
 create mode 100644 tests/entrypoints/test_llm_generate.py

diff --git a/tests/entrypoints/test_llm_generate.py b/tests/entrypoints/test_llm_generate.py
new file mode 100644
index 0000000000000..5e8b7ca4d9977
--- /dev/null
+++ b/tests/entrypoints/test_llm_generate.py
@@ -0,0 +1,41 @@
+import pytest
+
+from vllm import LLM, SamplingParams
+
+
+def test_multiple_sampling_params():
+
+    llm = LLM(model="facebook/opt-125m",
+              max_num_batched_tokens=4096,
+              tensor_parallel_size=1)
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    sampling_params = [
+        SamplingParams(temperature=0.01, top_p=0.95),
+        SamplingParams(temperature=0.3, top_p=0.95),
+        SamplingParams(temperature=0.7, top_p=0.95),
+        SamplingParams(temperature=0.99, top_p=0.95),
+    ]
+
+    # Multiple SamplingParams should be matched with each prompt
+    outputs = llm.generate(prompts, sampling_params=sampling_params)
+    assert len(prompts) == len(outputs)
+
+    # Exception raised, if the size of params does not match the size of prompts
+    with pytest.raises(ValueError):
+        outputs = llm.generate(prompts, sampling_params=sampling_params[:3])
+
+    # Single SamplingParams should be applied to every prompt
+    single_sampling_params = SamplingParams(temperature=0.3, top_p=0.95)
+    outputs = llm.generate(prompts, sampling_params=single_sampling_params)
+    assert len(prompts) == len(outputs)
+
+    # sampling_params is None, default params should be applied
+    outputs = llm.generate(prompts, sampling_params=None)
+    assert len(prompts) == len(outputs)
\ No newline at end of file
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index ae028200a98de..4d114665df34b 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -136,7 +136,8 @@ def set_tokenizer(
     def generate(
         self,
         prompts: Optional[Union[str, List[str]]] = None,
-        sampling_params: Optional[SamplingParams] = None,
+        sampling_params: Optional[Union[SamplingParams,
+                                        List[SamplingParams]]] = None,
         prompt_token_ids: Optional[List[List[int]]] = None,
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
@@ -151,7 +152,10 @@ def generate(
         Args:
             prompts: A list of prompts to generate completions for.
             sampling_params: The sampling parameters for text generation. If
-                None, we use the default sampling parameters.
+                None, we use the default sampling parameters. 
+                When it is a single value, it is applied to every prompt. 
+                When it is a list, the list must have the same length as the 
+                prompts and it is paired one by one with the prompt.
             prompt_token_ids: A list of token IDs for the prompts. If None, we
                 use the tokenizer to convert the prompts to token IDs.
             use_tqdm: Whether to use tqdm to display the progress bar.
@@ -172,27 +176,33 @@ def generate(
                 and len(prompts) != len(prompt_token_ids)):
             raise ValueError("The lengths of prompts and prompt_token_ids "
                              "must be the same.")
+
+        if prompts is not None:
+            num_requests = len(prompts)
+        else:
+            assert prompt_token_ids is not None
+            num_requests = len(prompt_token_ids)
+
         if sampling_params is None:
             # Use default sampling params.
             sampling_params = SamplingParams()
 
+        elif isinstance(sampling_params,
+                        list) and len(sampling_params) != num_requests:
+            raise ValueError("The lengths of prompts and sampling_params "
+                             "must be the same.")
         if multi_modal_data:
             multi_modal_data.data = multi_modal_data.data.to(torch.float16)
 
         # Add requests to the engine.
-        if prompts is not None:
-            num_requests = len(prompts)
-        else:
-            assert prompt_token_ids is not None
-            num_requests = len(prompt_token_ids)
-
         for i in range(num_requests):
             prompt = prompts[i] if prompts is not None else None
             token_ids = None if prompt_token_ids is None else prompt_token_ids[
                 i]
             self._add_request(
                 prompt,
-                sampling_params,
+                sampling_params[i]
+                if isinstance(sampling_params, list) else sampling_params,
                 token_ids,
                 lora_request=lora_request,
                 # Get ith image while maintaining the batch dim.
@@ -241,4 +251,4 @@ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
         # This is necessary because some requests may be finished earlier than
         # its previous requests.
         outputs = sorted(outputs, key=lambda x: int(x.request_id))
-        return outputs
+        return outputs
\ No newline at end of file

From 59e717bf49dfb8e60872436f7f6ebc3981f1eb2c Mon Sep 17 00:00:00 2001
From: Noam Gat <noamgat@gmail.com>
Date: Sat, 20 Apr 2024 11:33:16 +0300
Subject: [PATCH 056/106] Updating lm-format-enforcer version and adding links
 to decoding libraries in docs (#4222)

---
 requirements-common.txt                        | 2 +-
 vllm/engine/arg_utils.py                       | 6 +++++-
 vllm/model_executor/layers/quantization/fp8.py | 4 ++--
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index 3de0f98e7c0c3..3cc7bba8f84db 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -12,7 +12,7 @@ uvicorn[standard]
 pydantic >= 2.0  # Required for OpenAI server.
 prometheus_client >= 0.18.0
 tiktoken == 0.6.0  # Required for DBRX tokenizer
-lm-format-enforcer == 0.9.3
+lm-format-enforcer == 0.9.8
 outlines == 0.0.34 # Requires torch >= 2.1.0
 typing_extensions
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 0d00a695679a1..fafaf9c70bdb0 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -201,7 +201,11 @@ def add_cli_args(
             default='outlines',
             choices=['outlines', 'lm-format-enforcer'],
             help='Which engine will be used for guided decoding'
-            ' (JSON schema / regex etc).')
+            ' (JSON schema / regex etc) by default. Currently support '
+            'https://github.com/outlines-dev/outlines and '
+            'https://github.com/noamgat/lm-format-enforcer.'
+            ' Can be overridden per request via guided_decoding_backend'
+            ' parameter.')
         # Parallel arguments
         parser.add_argument('--worker-use-ray',
                             action='store_true',
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 9dc0e86e1243d..8df82e0e18edd 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 import torch
 from torch.nn import Module
@@ -114,7 +114,7 @@ def apply_weights(self,
         return output
 
 
-def per_tensor_quantize(tensor: torch.Tensor) -> tuple[torch.Tensor, float]:
+def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]:
     """Quantize a tensor using per-tensor static scaling factor.
 
     Args:

From e9163744b6266c531b76a87159e02c7f36342a9c Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sun, 21 Apr 2024 17:15:28 +0100
Subject: [PATCH 057/106] Don't show default value for flags in `EngineArgs`
 (#4223)

Co-authored-by: Harry Mellor <hmellor@oxts.com>
---
 docs/source/models/engine_args.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst
index 92bc7e0e843ed..bdf566d3ebbd1 100644
--- a/docs/source/models/engine_args.rst
+++ b/docs/source/models/engine_args.rst
@@ -9,6 +9,7 @@ Below, you can find an explanation of every engine argument for vLLM:
     :module: vllm.engine.arg_utils
     :func: _engine_args_parser
     :prog: -m vllm.entrypoints.openai.api_server
+    :nodefaultconst:
 
 Async Engine Arguments
 ----------------------
@@ -18,4 +19,5 @@ Below are the additional arguments related to the asynchronous engine:
 .. argparse::
     :module: vllm.engine.arg_utils
     :func: _async_engine_args_parser
-    :prog: -m vllm.entrypoints.openai.api_server
\ No newline at end of file
+    :prog: -m vllm.entrypoints.openai.api_server
+    :nodefaultconst:
\ No newline at end of file

From a8a2ad675c204f4f52d48a07374cf8f814e850ca Mon Sep 17 00:00:00 2001
From: xiaoji <44150358+YeFD@users.noreply.github.com>
Date: Mon, 22 Apr 2024 00:57:08 +0800
Subject: [PATCH 058/106] [Doc]: Update the doc of adding new models (#4236)

---
 docs/source/models/adding_model.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index a82c2cef10e83..cbc8099e6f70f 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -95,7 +95,7 @@ This method should load the weights from the HuggingFace's checkpoint file and a
 5. Register your model
 ----------------------
 
-Finally, include your :code:`*ForCausalLM` class in `vllm/model_executor/models/__init__.py <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/__init__.py>`_ and register it to the :code:`_MODEL_REGISTRY` in `vllm/model_executor/model_loader.py <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/model_loader.py>`_.
+Finally, register your :code:`*ForCausalLM` class to the :code:`_MODELS` in `vllm/model_executor/models/__init__.py <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/__init__.py>`_.
 
 6. Out-of-Tree Model Integration
 --------------------------------------------

From f17bb415f9792e5a0255c167fe040133a0c57e49 Mon Sep 17 00:00:00 2001
From: GeauxEric <yunding.eric@gmail.com>
Date: Sun, 21 Apr 2024 15:06:46 -0700
Subject: [PATCH 059/106] Make initialization of tokenizer and detokenizer
 optional (#3748)

Co-authored-by: Yun Ding <yunding@nvidia.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
---
 tests/engine/test_skip_tokenizer_init.py    | 23 ++++++++++++++++
 vllm/config.py                              |  7 ++++-
 vllm/engine/arg_utils.py                    |  7 ++++-
 vllm/engine/llm_engine.py                   | 29 +++++++++++++++------
 vllm/engine/output_processor/single_step.py |  5 ++--
 vllm/entrypoints/llm.py                     |  9 +++++++
 6 files changed, 68 insertions(+), 12 deletions(-)
 create mode 100644 tests/engine/test_skip_tokenizer_init.py

diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py
new file mode 100644
index 0000000000000..baa463a316902
--- /dev/null
+++ b/tests/engine/test_skip_tokenizer_init.py
@@ -0,0 +1,23 @@
+import pytest
+
+from vllm.entrypoints.llm import LLM
+from vllm.sampling_params import SamplingParams
+
+
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+def test_skip_tokenizer_initialization(model: str):
+    # This test checks if the flag skip_tokenizer_init skips the initialization
+    # of tokenizer and detokenizer. The generated output is expected to contain
+    # token ids.
+    llm = LLM(model=model, skip_tokenizer_init=True)
+    sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
+    with pytest.raises(ValueError) as err:
+        llm.generate("abc", sampling_params)
+    assert "prompts must be None if" in str(err.value)
+    outputs = llm.generate(prompt_token_ids=[[1, 2, 3]],
+                           sampling_params=sampling_params)
+    assert len(outputs) > 0
+    completions = outputs[0].outputs
+    assert len(completions) > 0
+    assert completions[0].text == ""
+    assert completions[0].token_ids
diff --git a/vllm/config.py b/vllm/config.py
index 708fdf3d1c487..33840f6f1c036 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -66,6 +66,8 @@ class ModelConfig:
         max_context_len_to_capture: Maximum context len covered by CUDA graphs.
             When a sequence has context length larger than this, we fall back
             to eager mode.
+        skip_tokenizer_init: If true, skip initialization of tokenizer and
+            detokenizer.
     """
 
     def __init__(
@@ -87,6 +89,7 @@ def __init__(
         enforce_eager: bool = False,
         max_context_len_to_capture: Optional[int] = None,
         max_logprobs: int = 5,
+        skip_tokenizer_init: bool = False,
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
@@ -103,6 +106,7 @@ def __init__(
         self.enforce_eager = enforce_eager
         self.max_context_len_to_capture = max_context_len_to_capture
         self.max_logprobs = max_logprobs
+        self.skip_tokenizer_init = skip_tokenizer_init
 
         self.hf_config = get_config(self.model, trust_remote_code, revision,
                                     code_revision)
@@ -110,7 +114,8 @@ def __init__(
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
         self.max_model_len = _get_and_verify_max_len(self.hf_text_config,
                                                      max_model_len)
-        self._verify_tokenizer_mode()
+        if not self.skip_tokenizer_init:
+            self._verify_tokenizer_mode()
         self._verify_quantization()
         # UPSTREAM SYNC: keep sparsity
         self._verify_sparsity()
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index fafaf9c70bdb0..5d3d73c0dd60e 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -18,6 +18,7 @@ class EngineArgs:
     """Arguments for vLLM engine."""
     model: str
     tokenizer: Optional[str] = None
+    skip_tokenizer_init: bool = False
     tokenizer_mode: str = 'auto'
     trust_remote_code: bool = False
     download_dir: Optional[str] = None
@@ -97,6 +98,10 @@ def add_cli_args(
             type=str,
             default=EngineArgs.tokenizer,
             help='Name or path of the huggingface tokenizer to use.')
+        parser.add_argument(
+            '--skip-tokenizer-init',
+            action='store_true',
+            help='Skip initialization of tokenizer and detokenizer')
         parser.add_argument(
             '--revision',
             type=str,
@@ -469,7 +474,7 @@ def create_engine_config(self, ) -> EngineConfig:
             # UPSTREAM SYNC: keep sparsity argument
             self.quantization, self.quantization_param_path, self.sparsity,
             self.enforce_eager,self.max_context_len_to_capture,
-            self.max_logprobs)
+            self.max_logprobs, self.skip_tokenizer_init)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space, self.kv_cache_dtype,
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index a7537c8761481..dcb644b34c234 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -100,6 +100,7 @@ def __init__(
             f"model={model_config.model!r}, "
             f"speculative_config={speculative_config!r}, "
             f"tokenizer={model_config.tokenizer!r}, "
+            f"skip_tokenizer_init={model_config.skip_tokenizer_init}, "
             f"tokenizer_mode={model_config.tokenizer_mode}, "
             f"revision={model_config.revision}, "
             f"tokenizer_revision={model_config.tokenizer_revision}, "
@@ -134,8 +135,14 @@ def __init__(
         self.decoding_config = decoding_config or DecodingConfig()
         self.log_stats = log_stats
 
-        self._init_tokenizer()
-        self.detokenizer = Detokenizer(self.tokenizer)
+        if not self.model_config.skip_tokenizer_init:
+            self.tokenizer: BaseTokenizerGroup
+            self._init_tokenizer()
+            self.detokenizer = Detokenizer(self.tokenizer)
+        else:
+            self.detokenizer = None
+            self.tokenizer = None
+
         self.seq_counter = Counter()
         self.generation_config_fields = _load_generation_config_dict(
             model_config)
@@ -189,9 +196,10 @@ def __init__(
                     parallel_config.disable_custom_all_reduce,
                 })
 
-        # Ping the tokenizer to ensure liveness if it runs in a
-        # different process.
-        self.tokenizer.ping()
+        if self.tokenizer:
+            # Ping the tokenizer to ensure liveness if it runs in a
+            # different process.
+            self.tokenizer.ping()
 
         # Create the scheduler.
         # NOTE: the cache_config here have been updated with the numbers of
@@ -298,7 +306,7 @@ def _init_tokenizer(self, **tokenizer_init_kwargs):
             trust_remote_code=self.model_config.trust_remote_code,
             revision=self.model_config.tokenizer_revision)
         init_kwargs.update(tokenizer_init_kwargs)
-        self.tokenizer: BaseTokenizerGroup = get_tokenizer_group(
+        self.tokenizer = get_tokenizer_group(
             self.parallel_config.tokenizer_pool_config, **init_kwargs)
 
     def _verify_args(self) -> None:
@@ -395,8 +403,13 @@ def add_request(
         # Create the sequences.
         block_size = self.cache_config.block_size
         seq_id = next(self.seq_counter)
-        eos_token_id = self.tokenizer.get_lora_tokenizer(
-            lora_request).eos_token_id
+        eos_token_id = None
+        if self.tokenizer:
+            eos_token_id = self.tokenizer.get_lora_tokenizer(
+                lora_request).eos_token_id
+        else:
+            logger.warning("Use None for EOS token id because tokenizer is "
+                           "not initialized")
         seq = Sequence(seq_id, prompt, prompt_token_ids, block_size,
                        eos_token_id, lora_request)
 
diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
index 1b7eb014f802b..b32937327ba7f 100644
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -59,7 +59,8 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
 
         # Process prompt logprobs
         prompt_logprobs = outputs.prompt_logprobs
-        if prompt_logprobs is not None and seq_group.sampling_params.detokenize:
+        if prompt_logprobs is not None and \
+            seq_group.sampling_params.detokenize and self.detokenizer:
             self.detokenizer.decode_prompt_logprobs_inplace(
                 seq_group, prompt_logprobs)
             seq_group.prompt_logprobs = prompt_logprobs
@@ -105,7 +106,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
             child_seqs.append((parent, parent))
 
         for seq, _ in child_seqs:
-            if seq_group.sampling_params.detokenize:
+            if seq_group.sampling_params.detokenize and self.detokenizer:
                 new_char_count = self.detokenizer.decode_sequence_inplace(
                     seq, seq_group.sampling_params)
             else:
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 4d114665df34b..8ae1cda8ccda7 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -32,6 +32,9 @@ class LLM:
         tokenizer: The name or path of a HuggingFace Transformers tokenizer.
         tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
             if available, and "slow" will always use the slow tokenizer.
+        skip_tokenizer_init: If true, skip initialization of tokenizer and
+            detokenizer. Expect valid prompt_token_ids and None for prompt
+            from the input.
         trust_remote_code: Trust remote code (e.g., from HuggingFace) when
             downloading the model and tokenizer.
         tensor_parallel_size: The number of GPUs to use for distributed
@@ -81,6 +84,7 @@ def __init__(
         model: str,
         tokenizer: Optional[str] = None,
         tokenizer_mode: str = "auto",
+        skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
@@ -103,6 +107,7 @@ def __init__(
             model=model,
             tokenizer=tokenizer,
             tokenizer_mode=tokenizer_mode,
+            skip_tokenizer_init=skip_tokenizer_init,
             trust_remote_code=trust_remote_code,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
@@ -169,6 +174,10 @@ def generate(
         if prompts is None and prompt_token_ids is None:
             raise ValueError("Either prompts or prompt_token_ids must be "
                              "provided.")
+        if self.llm_engine.model_config.skip_tokenizer_init \
+            and prompts is not None:
+            raise ValueError("prompts must be None if skip_tokenizer_init "
+                             "is True")
         if isinstance(prompts, str):
             # Convert a single prompt to a list.
             prompts = [prompts]

From 0a74885bbf7b60a048912843dbaa4f4f957aedcd Mon Sep 17 00:00:00 2001
From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Date: Mon, 22 Apr 2024 00:57:24 -0400
Subject: [PATCH 060/106] [AMD][Hardware][Misc][Bugfix] xformer cleanup and
 light navi logic and CI fixes and refactoring (#4129)

---
 .buildkite/test-pipeline.yaml                 |   2 -
 Dockerfile.rocm                               |   5 +-
 patch_xformers.rocm.sh                        |  33 ----
 .../commonpy_xformers-0.0.23.rocm.patch       |  13 --
 rocm_patch/flashpy_xformers-0.0.23.rocm.patch | 152 ------------------
 vllm/attention/backends/rocm_flash_attn.py    |  31 ++--
 6 files changed, 19 insertions(+), 217 deletions(-)
 delete mode 100644 patch_xformers.rocm.sh
 delete mode 100644 rocm_patch/commonpy_xformers-0.0.23.rocm.patch
 delete mode 100644 rocm_patch/flashpy_xformers-0.0.23.rocm.patch

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 0f920c7ec1442..f7c1569696249 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -15,10 +15,8 @@ steps:
   commands:
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
-  - VLLM_ATTENTION_BACKEND=ROCM_FLASH pytest -v -s basic_correctness/test_basic_correctness.py
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_ATTENTION_BACKEND=ROCM_FLASH pytest -v -s basic_correctness/test_chunked_prefill.py
 
 - label: Core Test
   command: pytest -v -s core
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index b1c5fac9d78ef..3f84b949481d1 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -14,7 +14,7 @@ RUN echo "Base image is $BASE_IMAGE"
 ARG FA_GFX_ARCHS="gfx90a;gfx942"
 RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
 
-ARG FA_BRANCH="3d2b6f5"
+ARG FA_BRANCH="ae7928c"
 RUN echo "FA_BRANCH is $FA_BRANCH"
 
 # whether to build flash-attention
@@ -92,13 +92,10 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
 COPY ./ /app/vllm
 
 RUN python3 -m pip install --upgrade pip numba
-RUN python3 -m pip install xformers==0.0.23 --no-deps
 
 RUN cd /app \
     && cd vllm \
     && pip install -U -r requirements-rocm.txt \
-    && if [ "$BUILD_FA" = "1" ]; then \
-       bash patch_xformers.rocm.sh; fi \
     && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
     && python3 setup.py install \
     && cd ..
diff --git a/patch_xformers.rocm.sh b/patch_xformers.rocm.sh
deleted file mode 100644
index de427b24d306f..0000000000000
--- a/patch_xformers.rocm.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-set -e
-
-XFORMERS_VERSION="0.0.23"
-
-export XFORMERS_INSTALLED_VERSION=$(python -c 'import xformers; print(xformers.__version__)')
-
-if [ "$XFORMERS_INSTALLED_VERSION" != "$XFORMERS_VERSION" ]; then
-    echo "ERROR: xformers version must be ${XFORMERS_VERSION}. ${XFORMERS_INSTALLED_VERSION} is installed"
-    exit 1
-fi
-
-export XFORMERS_FMHA_FLASH_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.flash.__file__)')
-export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.common.__file__)')
-
-echo "XFORMERS_FMHA_FLASH_PATH = ${XFORMERS_FMHA_FLASH_PATH}"
-echo "XFORMERS_FMHA_COMMON_PATH = ${XFORMERS_FMHA_COMMON_PATH}"
-
-if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then
-    echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}"
-    patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"
-    echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}"
-else
-    echo "${XFORMERS_FMHA_FLASH_PATH} was patched before"
-fi
-
-if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then
-    echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}"
-    patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"
-    echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}"
-else
-    echo "${XFORMERS_FMHA_COMMON_PATH} was patched before"
-fi
diff --git a/rocm_patch/commonpy_xformers-0.0.23.rocm.patch b/rocm_patch/commonpy_xformers-0.0.23.rocm.patch
deleted file mode 100644
index 4d7495cf13e1d..0000000000000
--- a/rocm_patch/commonpy_xformers-0.0.23.rocm.patch
+++ /dev/null
@@ -1,13 +0,0 @@
---- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/common.py	2023-11-29 03:17:03.930103539 +0000
-+++ common.py	2023-11-28 16:14:19.846233146 +0000
-@@ -298,8 +298,8 @@
-         dtype = d.query.dtype
-         if device_type not in cls.SUPPORTED_DEVICES:
-             reasons.append(f"device={device_type} (supported: {cls.SUPPORTED_DEVICES})")
--        if device_type == "cuda" and not _built_with_cuda:
--            reasons.append("xFormers wasn't build with CUDA support")
-+        #if device_type == "cuda" and not _built_with_cuda:
-+        #    reasons.append("xFormers wasn't build with CUDA support")
-         if device_type == "cuda":
-             device_capability = torch.cuda.get_device_capability(d.device)
-             if device_capability < cls.CUDA_MINIMUM_COMPUTE_CAPABILITY:
diff --git a/rocm_patch/flashpy_xformers-0.0.23.rocm.patch b/rocm_patch/flashpy_xformers-0.0.23.rocm.patch
deleted file mode 100644
index ac846728a7a91..0000000000000
--- a/rocm_patch/flashpy_xformers-0.0.23.rocm.patch
+++ /dev/null
@@ -1,152 +0,0 @@
---- flash_ori.py	2023-12-13 05:43:31.530752623 +0000
-+++ flash_patch.py	2023-12-13 06:00:45.962403104 +0000
-@@ -36,44 +36,44 @@
- 
- FLASH_VERSION = "0.0.0"
- try:
--    try:
--        from ... import _C_flashattention  # type: ignore[attr-defined]
--        from ..._cpp_lib import _build_metadata
--
--        if _build_metadata is not None:
--            FLASH_VERSION = _build_metadata.flash_version
--    except ImportError:
--        import flash_attn
--        from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention
--
--        FLASH_VERSION = flash_attn.__version__
--        flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3])
--        if (
--            flash_ver_parsed != (2, 3, 6)
--            and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1"
--        ):
--            raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api")
-+    #try:
-+    #    from ... import _C_flashattention  # type: ignore[attr-defined]
-+    #    from ..._cpp_lib import _build_metadata
-+
-+    #    if _build_metadata is not None:
-+    #        FLASH_VERSION = _build_metadata.flash_version
-+    #except ImportError:
-+    import flash_attn
-+    from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention
-+
-+    FLASH_VERSION = flash_attn.__version__
-+    #    flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3])
-+    #    if (
-+    #        flash_ver_parsed != (2, 3, 6)
-+    #        and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1"
-+    #    ):
-+    #        raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api")
- 
-     # create library so that flash-attn goes through the PyTorch Dispatcher
--    _flash_lib = torch.library.Library("xformers_flash", "DEF")
--
--    _flash_lib.define(
--        "flash_fwd(Tensor query, Tensor key, Tensor value, "
--        "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, "
--        "int max_seqlen_q, int max_seqlen_k, "
--        "float p, float softmax_scale, "
--        "bool is_causal, int window_left, "
--        "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)"
--    )
-+    #_flash_lib = torch.library.Library("xformers_flash", "DEF")
- 
--    _flash_lib.define(
--        "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, "
--        "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, "
--        "Tensor cu_seqlens_q, Tensor cu_seqlens_k, "
--        "int max_seqlen_q, int max_seqlen_k, "
--        "float p, float softmax_scale, bool is_causal, "
--        "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
--    )
-+    #_flash_lib.define(
-+    #    "flash_fwd(Tensor query, Tensor key, Tensor value, "
-+    #    "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, "
-+    #    "int max_seqlen_q, int max_seqlen_k, "
-+    #    "float p, float softmax_scale, "
-+    #    "bool is_causal, int window_left, "
-+    #    "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)"
-+    #)
-+
-+    #_flash_lib.define(
-+    #    "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, "
-+    #    "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, "
-+    #    "Tensor cu_seqlens_q, Tensor cu_seqlens_k, "
-+    #    "int max_seqlen_q, int max_seqlen_k, "
-+    #    "float p, float softmax_scale, bool is_causal, "
-+    #    "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
-+    #)
- 
-     def _flash_fwd(
-         query,
-@@ -111,8 +111,8 @@
-                 p,
-                 softmax_scale,
-                 is_causal,
--                window_left,  # window_size_left
--                window_right,  # window_size_right
-+        #        window_left,  # window_size_left
-+        #        window_right,  # window_size_right
-                 return_softmax,
-                 None,  # rng
-             )
-@@ -134,15 +134,15 @@
-                 out,
-                 cu_seq_lens_q,
-                 cu_seq_lens_k,
--                seqused_k,
-+         #       seqused_k,
-                 max_seq_len_q,
-                 max_seq_len_k,
-                 p,
-                 softmax_scale,
-                 False,
-                 is_causal,
--                window_left,
--                window_right,
-+         #       window_left,
-+         #       window_right,
-                 return_softmax,
-                 None,
-             )
-@@ -184,8 +184,8 @@
-                 p,
-                 softmax_scale,
-                 is_causal,
--                window_left,
--                window_right,
-+        #        window_left,
-+        #        window_right,
-                 None,
-                 rng_state,
-             )
-@@ -208,15 +208,15 @@
-                 softmax_scale,
-                 False,  # zero_tensors
-                 is_causal,
--                window_left,
--                window_right,
-+        #        window_left,
-+        #        window_right,
-                 None,
-                 rng_state,
-             )
-         return dq, dk, dv
- 
--    _flash_lib.impl("flash_fwd", _flash_fwd, "CUDA")
--    _flash_lib.impl("flash_bwd", _flash_bwd, "CUDA")
-+    #_flash_lib.impl("flash_fwd", _flash_fwd, "CUDA")
-+    #_flash_lib.impl("flash_bwd", _flash_bwd, "CUDA")
- except ImportError:
-     pass
- 
-@@ -400,7 +400,7 @@
-         implementation.
-     """
- 
--    OPERATOR = get_operator("xformers_flash", "flash_fwd")
-+    OPERATOR = _flash_fwd # get_operator("xformers_flash", "flash_fwd")
-     SUPPORTED_DEVICES: Set[str] = {"cuda"}
-     CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0)
-     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index c42660fb8f74f..dbaa71fd16add 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -154,25 +154,30 @@ def __init__(
                 f"Head size {head_size} is not supported by PagedAttention. "
                 f"Supported head sizes are: {suppored_head_sizes}.")
 
-        self.use_naive_attn = torch.cuda.get_device_capability()[0] != 9
+        self.use_naive_attn = False
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
         self.use_triton_flash_attn = (os.environ.get(
             "VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1"))
-        if self.use_naive_attn:
-            # AMD Radeon 7900 series (gfx1100) currently does not support
-            # xFormers nor FlashAttention. As a temporary workaround, we use
-            # naive PyTorch implementation of attention.
-            self.attn_fuc = _naive_attention
-            logger.debug("Using naive attention in ROCmBackend")
-        elif self.use_triton_flash_attn:
+        if self.use_triton_flash_attn:
             from vllm.attention.ops.triton_flash_attention import (  # noqa: F401
                 triton_attention)
             self.attn_func = triton_attention
             logger.debug("Using Triton FA in ROCmBackend")
         else:
-            from flash_attn import flash_attn_varlen_func  # noqa: F401
-            self.attn_func = flash_attn_varlen_func
-            logger.debug("Using CK FA in ROCmBackend")
+            # if not using triton, navi3x not use flash-attn either
+            if torch.cuda.get_device_capability()[0] == 11:
+                self.use_naive_attn = True
+            else:
+                try:
+                    from flash_attn import flash_attn_varlen_func  # noqa: F401
+                    self.attn_func = flash_attn_varlen_func
+                    logger.debug("Using CK FA in ROCmBackend")
+                except ModuleNotFoundError:
+                    self.use_naive_attn = True
+
+            if self.use_naive_attn:
+                self.attn_func = _naive_attention
+                logger.debug("Using naive attention in ROCmBackend")
 
     def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor:
         """torch.repeat_interleave(x, dim=1, repeats=n_rep)"""
@@ -247,13 +252,13 @@ def forward(
                 # triton attention
                 # When block_tables are not filled, it means q and k are the
                 # prompt, and they have the same length.
-                if self.use_naive_attn or self.use_triton_flash_attn:
+                if self.use_triton_flash_attn or self.use_naive_attn:
                     if self.num_kv_heads != self.num_heads:
                         # Interleave for MQA workaround.
                         key = self.repeat_kv(key, self.num_queries_per_kv)
                         value = self.repeat_kv(value, self.num_queries_per_kv)
                     if self.use_naive_attn:
-                        out = self.attn_fuc(
+                        out = self.attn_func(
                             query,
                             key,
                             value,

From 3ca56cf94c85c396f29a842479757197855f98d2 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 21 Apr 2024 23:04:16 -0700
Subject: [PATCH 061/106] [Core][Distributed] fix _is_full_nvlink detection
 (#4233)

---
 .../device_communicators/custom_all_reduce.py | 48 ++++++++++++-------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index 7602897d3dd8f..58cbe77baf7e0 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -1,5 +1,6 @@
+import os
 from contextlib import contextmanager
-from typing import Optional
+from typing import List, Optional
 
 import torch
 import torch.distributed as dist
@@ -53,14 +54,20 @@ def init_custom_ar() -> None:
         return False
     # test nvlink first, this will filter out most of the cases
     # where custom allreduce is not supported
-    full_nvlink = _is_full_nvlink(rank, world_size)
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        device_ids = list(
+            map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(",")))
+    else:
+        device_ids = list(range(num_dev))
+    # this checks hardware and driver support for NVLink
+    full_nvlink = _is_full_nvlink(device_ids)
     if world_size > 2 and not full_nvlink:
         logger.warn(
             "Custom allreduce is disabled because it's not supported on more"
             " than two PCIe-only GPUs. To silence this warning, specify"
             " disable_custom_all_reduce=True explicitly.")
         return
-    # test P2P capability
+    # test P2P capability, this checks software/cudaruntime support
     # this is expensive to compute at the first time
     # then we cache the result
     if not _can_p2p(rank, world_size):
@@ -138,23 +145,28 @@ def _nvml():
         pynvml.nvmlShutdown()
 
 
-# query if the set of gpus are fully connected by nvlink (1 hop)
 @_nvml()
-def _is_full_nvlink(rank, world_size):
-    handle = pynvml.nvmlDeviceGetHandleByIndex(rank)
-    for i in range(world_size):
-        if i != rank:
-            try:
-                peer_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
-                p2p_status = pynvml.nvmlDeviceGetP2PStatus(
-                    handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
-                if p2p_status != pynvml.NVML_P2P_STATUS_OK:
+def _is_full_nvlink(device_ids: List[int]) -> bool:
+    """
+    query if the set of gpus are fully connected by nvlink (1 hop)
+    Note that `pynvml` is not affected by `CUDA_VISIBLE_DEVICES`,
+    so it works on real physical device ids.
+    """
+    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in device_ids]
+    for i, handle in enumerate(handles):
+        for j, peer_handle in enumerate(handles):
+            if i < j:
+                try:
+                    p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+                        handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
+                    if p2p_status != pynvml.NVML_P2P_STATUS_OK:
+                        return False
+                except pynvml.NVMLError as error:
+                    logger.error(
+                        "NVLink detection failed. This is normal if your"
+                        " machine has no NVLink equipped.",
+                        exc_info=error)
                     return False
-            except pynvml.NVMLError as error:
-                logger.info(
-                    f"NVLink detection failed with message \"{str(error)}\". "
-                    "This is normal if your machine has no NVLink equipped")
-                return False
     return True
 
 
From 1f7027e5602b87918a05efc5336581a66d2cee44 Mon Sep 17 00:00:00 2001
From: Isotr0py <41363108+Isotr0py@users.noreply.github.com>
Date: Mon, 22 Apr 2024 15:44:16 +0800
Subject: [PATCH 062/106] [Misc] Add vision language model support to CPU
 backend (#3968)

---
 vllm/executor/cpu_executor.py   |  1 +
 vllm/worker/cpu_model_runner.py | 60 ++++++++++++++++++++-------------
 vllm/worker/cpu_worker.py       | 24 ++++++++-----
 3 files changed, 53 insertions(+), 32 deletions(-)

diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index f925a6fc93dcd..35249cd7302cb 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -45,6 +45,7 @@ def _init_worker(self):
             rank=0,
             distributed_init_method=distributed_init_method,
             lora_config=self.lora_config,
+            vision_language_config=self.vision_language_config,
             kv_cache_dtype=self.cache_config.cache_dtype,
             is_driver_worker=True,
         )
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 7377c8931cefa..a82373d3d1626 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -5,7 +5,7 @@
 
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig)
+                         ParallelConfig, SchedulerConfig, VisionLanguageConfig)
 from vllm.distributed import broadcast_tensor_dict
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
@@ -29,6 +29,7 @@ def __init__(
         device_config: DeviceConfig,
         load_config: LoadConfig,
         lora_config: Optional[LoRAConfig],
+        vision_language_config: Optional[VisionLanguageConfig],
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
         *args,
@@ -38,6 +39,7 @@ def __init__(
         self.parallel_config = parallel_config
         self.scheduler_config = scheduler_config
         self.lora_config = lora_config
+        self.vision_language_config = vision_language_config
         self.load_config = load_config
         self.is_driver_worker = is_driver_worker
 
@@ -59,13 +61,14 @@ def __init__(
         self.block_size: int  # Set after initial profiling.
 
     def load_model(self) -> None:
-        self.model = get_model(model_config=self.model_config,
-                               load_config=self.load_config,
-                               device_config=self.device_config,
-                               vision_language_config=None,
-                               lora_config=self.lora_config,
-                               parallel_config=self.parallel_config,
-                               scheduler_config=self.scheduler_config)
+        self.model = get_model(
+            model_config=self.model_config,
+            load_config=self.load_config,
+            device_config=self.device_config,
+            vision_language_config=self.vision_language_config,
+            lora_config=self.lora_config,
+            parallel_config=self.parallel_config,
+            scheduler_config=self.scheduler_config)
 
     def _prepare_prompt(
         self,
@@ -76,6 +79,7 @@ def _prepare_prompt(
         input_positions: List[int] = []
         slot_mapping: List[int] = []
         prompt_lens: List[int] = []
+        multi_modal_input_list: List[torch.Tensor] = []
 
         for seq_group_metadata in seq_group_metadata_list:
             assert seq_group_metadata.is_prompt
@@ -96,6 +100,10 @@ def _prepare_prompt(
             # is always the first token in the sequence.
             input_positions.extend(list(range(computed_len, prompt_len)))
 
+            if seq_group_metadata.multi_modal_data:
+                multi_modal_input_list.append(
+                    seq_group_metadata.multi_modal_data.data)
+
             # Compute the slot mapping.
             block_table = seq_group_metadata.block_tables[seq_id]
             # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID,
@@ -118,6 +126,15 @@ def _prepare_prompt(
                 slot = block_number * self.block_size + block_offset
                 slot_mapping.append(slot)
 
+        if multi_modal_input_list:
+            assert self.vision_language_config, (
+                "Multi-modal inputs are only supported by "
+                "vision language models.")
+            multi_modal_input = torch.cat(multi_modal_input_list,
+                                          dim=0).to(self.device)
+        else:
+            multi_modal_input = None
+
         num_prompt_tokens = len(input_tokens)
 
         input_tokens = torch.tensor(input_tokens,
@@ -144,12 +161,8 @@ def _prepare_prompt(
             slot_mapping=slot_mapping,
             kv_cache_dtype=self.kv_cache_dtype,
         )
-        return (
-            input_tokens,
-            input_positions,
-            attn_metadata,
-            prompt_lens,
-        )
+        return (input_tokens, input_positions, attn_metadata, prompt_lens,
+                multi_modal_input)
 
     def _prepare_decode(
         self,
@@ -336,14 +349,16 @@ def prepare_input_tensors(
         seq_group_metadata_list: List[SequenceGroupMetadata],
     ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata,
                SamplingMetadata]:
+        multi_modal_input = None
         if self.is_driver_worker:
             # NOTE: We assume that all sequences in the group are all prompts or
             # all decodes.
             is_prompt = seq_group_metadata_list[0].is_prompt
             # Prepare input tensors.
             if is_prompt:
-                (input_tokens, input_positions, attn_metadata,
-                 prompt_lens) = self._prepare_prompt(seq_group_metadata_list)
+                (input_tokens, input_positions, attn_metadata, prompt_lens,
+                 multi_modal_input
+                 ) = self._prepare_prompt(seq_group_metadata_list)
             else:
                 (input_tokens, input_positions,
                  attn_metadata) = self._prepare_decode(seq_group_metadata_list)
@@ -376,12 +391,8 @@ def prepare_input_tensors(
                 perform_sampling=False,
             )
 
-        return (
-            input_tokens,
-            input_positions,
-            attn_metadata,
-            sampling_metadata,
-        )
+        return (input_tokens, input_positions, attn_metadata,
+                sampling_metadata, multi_modal_input)
 
     @torch.inference_mode()
     def execute_model(
@@ -389,7 +400,8 @@ def execute_model(
         seq_group_metadata_list: List[SequenceGroupMetadata],
         kv_caches: List[torch.Tensor],
     ) -> Optional[SamplerOutput]:
-        (input_tokens, input_positions, attn_metadata, sampling_metadata
+        (input_tokens, input_positions, attn_metadata, sampling_metadata,
+         multi_modal_input
          ) = self.prepare_input_tensors(seq_group_metadata_list)
 
         model_executable = self.model
@@ -399,6 +411,8 @@ def execute_model(
             "kv_caches": kv_caches,
             "attn_metadata": attn_metadata,
         }
+        if self.vision_language_config:
+            execute_model_kwargs.update({"image_input": multi_modal_input})
 
         hidden_states = model_executable(**execute_model_kwargs)
 
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 3652830b7d519..83ededd742533 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -6,7 +6,8 @@
 
 from vllm.attention import get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig)
+                         ModelConfig, ParallelConfig, SchedulerConfig,
+                         VisionLanguageConfig)
 from vllm.distributed import (broadcast_tensor_dict,
                               ensure_model_parallel_initialized,
                               init_distributed_environment)
@@ -122,6 +123,7 @@ def __init__(
         rank: int,
         distributed_init_method: str,
         lora_config: Optional[LoRAConfig] = None,
+        vision_language_config: Optional[VisionLanguageConfig] = None,
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
     ) -> None:
@@ -135,21 +137,25 @@ def __init__(
         self.rank = rank
         self.distributed_init_method = distributed_init_method
         self.lora_config = lora_config
+        self.vision_language_config = vision_language_config
         self.is_driver_worker = is_driver_worker
         if self.is_driver_worker:
             assert self.rank == 0, "The driver worker must have rank 0."
+
         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing
             from vllm.utils import init_cached_hf_modules
             init_cached_hf_modules()
-        self.model_runner = CPUModelRunner(model_config,
-                                           parallel_config,
-                                           scheduler_config,
-                                           device_config,
-                                           load_config=self.load_config,
-                                           lora_config=self.lora_config,
-                                           kv_cache_dtype=kv_cache_dtype,
-                                           is_driver_worker=is_driver_worker)
+        self.model_runner = CPUModelRunner(
+            model_config,
+            parallel_config,
+            scheduler_config,
+            device_config,
+            load_config=self.load_config,
+            lora_config=self.lora_config,
+            vision_language_config=self.vision_language_config,
+            kv_cache_dtype=kv_cache_dtype,
+            is_driver_worker=is_driver_worker)
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
         self.cache_engine: CPUCacheEngine

From 81e4d263b531d28ad01f97422c12481d0cc09053 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Mon, 22 Apr 2024 00:54:16 -0700
Subject: [PATCH 063/106] [Bugfix] Fix type annotations in CPU model runner
 (#4256)

---
 vllm/worker/cpu_model_runner.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index a82373d3d1626..bf0a6c84e6f07 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -73,7 +73,8 @@ def load_model(self) -> None:
     def _prepare_prompt(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int]]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
+               Optional[torch.Tensor]]:
         assert len(seq_group_metadata_list) > 0
         input_tokens: List[int] = []
         input_positions: List[int] = []
@@ -347,8 +348,8 @@ def _prepare_sample(
     def prepare_input_tensors(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata,
-               SamplingMetadata]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
+               Optional[torch.Tensor]]:
         multi_modal_input = None
         if self.is_driver_worker:
             # NOTE: We assume that all sequences in the group are all prompts or

From 7e236d1f680b0c6869c0e0a83cd138edbef3c71c Mon Sep 17 00:00:00 2001
From: Tao He <sighingnow@gmail.com>
Date: Mon, 22 Apr 2024 17:19:51 +0800
Subject: [PATCH 064/106] [Frontend] Enable support for CPU backend in
 AsyncLLMEngine. (#3993)

Signed-off-by: Tao He <sighingnow@gmail.com>
---
 vllm/engine/async_llm_engine.py |  5 +++++
 vllm/executor/cpu_executor.py   | 27 +++++++++++++++++++++++++--
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index ca4ba66f09cb8..3a2f7db679358 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -343,6 +343,11 @@ def from_engine_args(
         if engine_config.device_config.device_type == "neuron":
             from vllm.executor.neuron_executor import NeuronExecutorAsync
             executor_class = NeuronExecutorAsync
+        elif engine_config.device_config.device_type == "cpu":
+            assert not engine_config.parallel_config.worker_use_ray, (
+                "Ray is not supported with the CPU backend.")
+            from vllm.executor.cpu_executor import CPUExecutorAsync
+            executor_class = CPUExecutorAsync
         elif engine_config.parallel_config.worker_use_ray:
             initialize_ray_cluster(engine_config.parallel_config)
             from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 35249cd7302cb..8d6a1fff91fd8 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -4,11 +4,12 @@
 import torch
 
 from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
-from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
-from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async)
 
 logger = init_logger(__name__)
 
@@ -100,6 +101,28 @@ def check_health(self) -> None:
         return
 
 
+class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):
+
+    async def execute_model_async(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> SamplerOutput:
+        output = await make_async(self.driver_worker.execute_model)(
+            seq_group_metadata_list=seq_group_metadata_list,
+            blocks_to_swap_in=blocks_to_swap_in,
+            blocks_to_swap_out=blocks_to_swap_out,
+            blocks_to_copy=blocks_to_copy)
+        return output
+
+    async def check_health_async(self) -> None:
+        # CPUExecutor will always be healthy as long as
+        # it's running.
+        return
+
+
 def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
     if config.dtype == torch.float16:
         logger.warning("float16 is not supported on CPU, casting to bfloat16.")

From 0065ecb0efa540eccf8882b17d999b27210dc304 Mon Sep 17 00:00:00 2001
From: alexm-nm <59768536+alexm-nm@users.noreply.github.com>
Date: Mon, 22 Apr 2024 12:10:48 -0400
Subject: [PATCH 065/106] [Bugfix] Ensure download_weights_from_hf(..) inside
 loader is using the revision parameter (#4217)

---
 vllm/model_executor/model_loader/loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 6c8cb2935f37e..64cd186506bdb 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -172,7 +172,7 @@ def _prepare_weights(self, model_name_or_path: str,
         if not is_local:
             hf_folder = download_weights_from_hf(model_name_or_path,
                                                  self.load_config.download_dir,
-                                                 allow_patterns)
+                                                 allow_patterns, revision)
         else:
             hf_folder = model_name_or_path
 

From 91e2e191e8734debd9bba49d2b6884894c044d83 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Mon, 22 Apr 2024 17:36:54 +0100
Subject: [PATCH 066/106] Add example scripts to documentation (#4225)

Co-authored-by: Harry Mellor <hmellor@oxts.com>
---
 .gitignore                                    |  2 +
 docs/source/conf.py                           |  9 ++-
 docs/source/generate_examples.py              | 61 +++++++++++++++++++
 .../examples/examples_index.template.rst      |  8 +++
 docs/source/index.rst                         |  1 +
 ...nt.py => openai_chat_completion_client.py} |  0
 6 files changed, 80 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/generate_examples.py
 create mode 100644 docs/source/getting_started/examples/examples_index.template.rst
 rename examples/{openai_chatcompletion_client.py => openai_chat_completion_client.py} (100%)

diff --git a/.gitignore b/.gitignore
index f411b54cc3da3..06e8923ced311 100644
--- a/.gitignore
+++ b/.gitignore
@@ -72,6 +72,8 @@ instance/
 
 # Sphinx documentation
 docs/_build/
+docs/source/getting_started/examples/*.rst
+!**/*.template.rst
 
 # PyBuilder
 .pybuilder/
diff --git a/docs/source/conf.py b/docs/source/conf.py
index cfa956b143ba3..aac8cbb63ebeb 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -48,7 +48,7 @@
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns: List[str] = []
+exclude_patterns: List[str] = ["**/*.template.rst"]
 
 # Exclude the prompt "$" when copying code
 copybutton_prompt_text = r"\$ "
@@ -73,6 +73,13 @@
 # so a file named "default.css" will overwrite the builtin "default.css".
 # html_static_path = ['_static']
 
+
+# Generate additional rst documentation here.
+def setup(app):
+    from docs.source.generate_examples import generate_examples
+    generate_examples()
+
+
 # Mock out external dependencies here.
 autodoc_mock_imports = [
     "cpuinfo",
diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py
new file mode 100644
index 0000000000000..79b49a186236a
--- /dev/null
+++ b/docs/source/generate_examples.py
@@ -0,0 +1,61 @@
+import re
+from pathlib import Path
+
+
+def fix_case(text: str) -> str:
+    subs = [
+        ("api", "API"),
+        ("llm", "LLM"),
+        ("vllm", "vLLM"),
+        ("openai", "OpenAI"),
+        ("multilora", "MultiLoRA"),
+    ]
+    for sub in subs:
+        text = re.sub(*sub, text, flags=re.IGNORECASE)
+    return text
+
+
+def underline(title: str, character: str = "=") -> str:
+    return f"{title}\n{character * len(title)}"
+
+
+def generate_title(filename: str) -> str:
+    # Turn filename into a title
+    title = filename.replace("_", " ").title()
+    # Handle acronyms and names
+    title = fix_case(title)
+    # Underline title
+    title = underline(title)
+    return title
+
+
+def generate_examples():
+    root_dir = Path(__file__).parent.parent.parent.resolve()
+
+    # Source paths
+    script_dir = root_dir / "examples"
+    script_paths = sorted(script_dir.glob("*.py"))
+
+    # Destination paths
+    doc_dir = root_dir / "docs/source/getting_started/examples"
+    doc_paths = [doc_dir / f"{path.stem}.rst" for path in script_paths]
+
+    # Generate the example docs for each example script
+    for script_path, doc_path in zip(script_paths, doc_paths):
+        script_url = f"https://github.com/vllm-project/vllm/blob/main/examples/{script_path.name}"
+        # Make script_path relative to doc_path and call it include_path
+        include_path = '../../../..' / script_path.relative_to(root_dir)
+        content = (f"{generate_title(doc_path.stem)}\n\n"
+                   f"Source {script_url}.\n\n"
+                   f".. literalinclude:: {include_path}\n"
+                   "    :language: python\n"
+                   "    :linenos:\n")
+        with open(doc_path, "w+") as f:
+            f.write(content)
+
+    # Generate the toctree for the example scripts
+    with open(doc_dir / "examples_index.template.rst") as f:
+        examples_index = f.read()
+    with open(doc_dir / "examples_index.rst", "w+") as f:
+        example_docs = "\n   ".join(path.stem for path in script_paths)
+        f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs))
diff --git a/docs/source/getting_started/examples/examples_index.template.rst b/docs/source/getting_started/examples/examples_index.template.rst
new file mode 100644
index 0000000000000..1b34cccbae15a
--- /dev/null
+++ b/docs/source/getting_started/examples/examples_index.template.rst
@@ -0,0 +1,8 @@
+Examples
+=================================
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Scripts
+
+   %EXAMPLE_DOCS%
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 5d5d52696ba34..e8daa5f052754 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -65,6 +65,7 @@ Documentation
    getting_started/neuron-installation
    getting_started/cpu-installation
    getting_started/quickstart
+   getting_started/examples/examples_index
 
 .. toctree::
    :maxdepth: 1
diff --git a/examples/openai_chatcompletion_client.py b/examples/openai_chat_completion_client.py
similarity index 100%
rename from examples/openai_chatcompletion_client.py
rename to examples/openai_chat_completion_client.py

From 542dc7054e5f3cafe19a36bb6c885edea8735039 Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Tue, 23 Apr 2024 06:11:06 +0900
Subject: [PATCH 067/106] [Core] Scheduler perf fix (#4270)

---
 tests/core/test_scheduler.py | 18 +++++++++---------
 vllm/core/scheduler.py       |  7 ++-----
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index 9588a1bead5f6..a2511238506b0 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -540,7 +540,7 @@ def test_decode_schedule_preempted():
     curr_loras = None
     for i in range(3):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
-        scheduler._allocate_and_set_running(seq_group, 60)
+        scheduler._allocate_and_set_running(seq_group)
         append_new_token_seq_group(60, seq_group, 1)
         running.append(seq_group)
     scheduler.block_manager.can_append_slots = MagicMock()
@@ -581,7 +581,7 @@ def test_decode_swap_beam_search():
     budget = create_token_budget()
     for i in range(3):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2)
-        scheduler._allocate_and_set_running(seq_group, 60)
+        scheduler._allocate_and_set_running(seq_group)
         running.append(seq_group)
         append_new_token_seq_group(60, seq_group, 1)
         budget.add_num_seqs(seq_group.request_id,
@@ -629,7 +629,7 @@ def test_schedule_decode_blocks_to_copy_update():
     running = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    scheduler._allocate_and_set_running(seq_group, 60)
+    scheduler._allocate_and_set_running(seq_group)
     append_new_token_seq_group(60, seq_group, 1)
     running.append(seq_group)
 
@@ -659,7 +659,7 @@ def test_schedule_swapped_simple():
     curr_loras = None
     blocks_to_swap_out = {}
     _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
-    scheduler._allocate_and_set_running(seq_group, 60)
+    scheduler._allocate_and_set_running(seq_group)
     append_new_token_seq_group(60, seq_group, 1)
     scheduler._swap_out(seq_group, blocks_to_swap_out)
     swapped.append(seq_group)
@@ -687,7 +687,7 @@ def test_schedule_swapped_max_token_budget():
     blocks_to_swap_out = {}
     for _ in range(2):
         _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
-        scheduler._allocate_and_set_running(seq_group, 60)
+        scheduler._allocate_and_set_running(seq_group)
         append_new_token_seq_group(60, seq_group, 1)
         scheduler._swap_out(seq_group, blocks_to_swap_out)
         swapped.append(seq_group)
@@ -721,7 +721,7 @@ def test_schedule_swapped_max_seqs():
     blocks_to_swap_out = {}
     for i in range(4):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
-        scheduler._allocate_and_set_running(seq_group, 60)
+        scheduler._allocate_and_set_running(seq_group)
         append_new_token_seq_group(60, seq_group, 1)
         scheduler._swap_out(seq_group, blocks_to_swap_out)
         swapped.append(seq_group)
@@ -759,7 +759,7 @@ def test_schedule_swapped_max_loras():
                                                lora_name=str(i),
                                                lora_int_id=i + 1,
                                                lora_local_path="abc"))
-        scheduler._allocate_and_set_running(seq_group, 60)
+        scheduler._allocate_and_set_running(seq_group)
         append_new_token_seq_group(60, seq_group, 1)
         scheduler._swap_out(seq_group, blocks_to_swap_out)
         swapped.append(seq_group)
@@ -783,7 +783,7 @@ def test_schedule_swapped_cannot_swap_in():
     blocks_to_swap_out = {}
     for _ in range(2):
         _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
-        scheduler._allocate_and_set_running(seq_group, 60)
+        scheduler._allocate_and_set_running(seq_group)
         append_new_token_seq_group(60, seq_group, 1)
         scheduler._swap_out(seq_group, blocks_to_swap_out)
         swapped.append(seq_group)
@@ -808,7 +808,7 @@ def test_schedule_swapped_blocks_to_copy():
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
     _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
-    scheduler._allocate_and_set_running(seq_group, 60)
+    scheduler._allocate_and_set_running(seq_group)
     append_new_token_seq_group(60, seq_group, 1)
     blocks_to_swap_out = {}
     scheduler._swap_out(seq_group, blocks_to_swap_out)
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 76379cf82bd1e..a302557d81fa8 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -297,7 +297,6 @@ def num_decoding_tokens_per_seq(self) -> int:
 
     def add_seq_group(self, seq_group: SequenceGroup) -> None:
         # Add sequence groups to the waiting queue.
-        logger.debug(f"add_seq_group {seq_group.request_id}")
         self.waiting.append(seq_group)
 
     def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
@@ -427,7 +426,6 @@ def _schedule_running(
                         swapped_out.append(seq_group)
                     break
             else:
-                logger.debug(f"append slot for {seq_group}")
                 self._append_slots(seq_group, blocks_to_copy)
                 is_prefill = seq_group.is_prefill()
                 if is_prefill:
@@ -659,7 +657,7 @@ def _schedule_prefills(
             if curr_loras is not None and lora_int_id > 0:
                 curr_loras.add(lora_int_id)
             waiting_queue.popleft()
-            self._allocate_and_set_running(seq_group, num_new_tokens)
+            self._allocate_and_set_running(seq_group)
             seq_groups.append(
                 ScheduledSequenceGroup(seq_group=seq_group,
                                        token_chunk_size=num_new_tokens))
@@ -973,8 +971,7 @@ def free_finished_seq_groups(self) -> None:
         self.running = deque(seq_group for seq_group in self.running
                              if not seq_group.is_finished())
 
-    def _allocate_and_set_running(self, seq_group: SequenceGroup,
-                                  num_new_tokens: int) -> None:
+    def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None:
         self.block_manager.allocate(seq_group)
         for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
             seq.status = SequenceStatus.RUNNING

From 949c8049589a9f36a8a0f5f2b42616a635f6e43a Mon Sep 17 00:00:00 2001
From: Zhanghao Wu <zhanghao.wu@outlook.com>
Date: Mon, 22 Apr 2024 15:34:31 -0700
Subject: [PATCH 068/106] [Doc] Update the SkyPilot doc with serving and
 Llama-3 (#4276)

---
 docs/source/serving/run_on_sky.rst | 287 ++++++++++++++++++++++++++---
 1 file changed, 264 insertions(+), 23 deletions(-)

diff --git a/docs/source/serving/run_on_sky.rst b/docs/source/serving/run_on_sky.rst
index 2c88d24dc5d0b..bd33c76cec3de 100644
--- a/docs/source/serving/run_on_sky.rst
+++ b/docs/source/serving/run_on_sky.rst
@@ -1,7 +1,7 @@
 .. _on_cloud:
 
-Running on clouds with SkyPilot
-===============================
+Deploying and scaling up with SkyPilot
+================================================
 
 .. raw:: html
 
@@ -9,51 +9,75 @@ Running on clouds with SkyPilot
         <img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/>
     </p>
 
-vLLM can be run on the cloud to scale to multiple GPUs with `SkyPilot <https://github.com/skypilot-org/skypilot>`__, an open-source framework for running LLMs on any cloud.
+vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with `SkyPilot <https://github.com/skypilot-org/skypilot>`__, an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in `SkyPilot AI gallery <https://skypilot.readthedocs.io/en/latest/gallery/index.html>`__.
 
-To install SkyPilot and setup your cloud credentials, run:
+
+Prerequisites
+-------------
+
+- Go to the `HuggingFace model page <https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct>`__ and request access to the model :code:`meta-llama/Meta-Llama-3-8B-Instruct`.
+- Check that you have installed SkyPilot (`docs <https://skypilot.readthedocs.io/en/latest/getting-started/installation.html>`__).
+- Check that :code:`sky check` shows clouds or Kubernetes are enabled.
 
 .. code-block:: console
 
-    $ pip install skypilot
-    $ sky check
+    pip install skypilot-nightly
+    sky check
+
+
+Run on a single instance
+------------------------
 
 See the vLLM SkyPilot YAML for serving, `serving.yaml <https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml>`__.
 
 .. code-block:: yaml
 
     resources:
-        accelerators: A100
+        accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
+        use_spot: True
+        disk_size: 512  # Ensure model checkpoints can fit.
+        disk_tier: best
+        ports: 8081  # Expose to internet traffic.
 
     envs:
-        MODEL_NAME: decapoda-research/llama-13b-hf
-        TOKENIZER: hf-internal-testing/llama-tokenizer
+        MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+        HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
 
     setup: |
-        conda create -n vllm python=3.9 -y
+        conda create -n vllm python=3.10 -y
         conda activate vllm
-        git clone https://github.com/vllm-project/vllm.git
-        cd vllm
-        pip install .
-        pip install gradio
+
+        pip install vllm==0.4.0.post1
+        # Install Gradio for web UI.
+        pip install gradio openai
+        pip install flash-attn==2.5.7
 
     run: |
         conda activate vllm
         echo 'Starting vllm api server...'
-        python -u -m vllm.entrypoints.api_server \
-                        --model $MODEL_NAME \
-                        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
-                        --tokenizer $TOKENIZER 2>&1 | tee api_server.log &
+        python -u -m vllm.entrypoints.openai.api_server \
+            --port 8081 \
+            --model $MODEL_NAME \
+            --trust-remote-code \
+            --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+            2>&1 | tee api_server.log &
+        
         echo 'Waiting for vllm api server to start...'
         while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
+
         echo 'Starting gradio server...'
-        python vllm/examples/gradio_webserver.py
+        git clone https://github.com/vllm-project/vllm.git || true
+        python vllm/examples/gradio_openai_chatbot_webserver.py \
+            -m $MODEL_NAME \
+            --port 8811 \
+            --model-url http://localhost:8081/v1 \
+            --stop-token-ids 128009,128001
 
-Start the serving the LLaMA-13B model on an A100 GPU:
+Start the serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): 
 
 .. code-block:: console
 
-    $ sky launch serving.yaml
+    HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
 
 Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion.
 
@@ -61,9 +85,226 @@ Check the output of the command. There will be a shareable gradio link (like the
 
     (task, pid=7431) Running on public URL: https://<gradio-hash>.gradio.live
 
-**Optional**: Serve the 65B model instead of the default 13B and use more GPU:
+**Optional**: Serve the 70B model instead of the default 8B and use more GPU:
+
+.. code-block:: console
+
+    HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct
+
+
+Scale up to multiple replicas
+-----------------------------
+
+SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file.
+
+.. code-block:: yaml
+
+    service:
+        replicas: 2
+        # An actual request for readiness probe.
+        readiness_probe:
+            path: /v1/chat/completions
+            post_data:
+            model: $MODEL_NAME
+            messages:
+                - role: user
+                content: Hello! What is your name?
+        max_tokens: 1
+        
+.. raw:: html
+
+    <details>
+    <summary>Click to see the full recipe YAML</summary>
+
+
+.. code-block:: yaml
+
+    service:
+        replicas: 2
+        # An actual request for readiness probe.
+        readiness_probe:
+            path: /v1/chat/completions
+            post_data:
+            model: $MODEL_NAME
+            messages:
+                - role: user
+                content: Hello! What is your name?
+        max_tokens: 1
+
+    resources:
+        accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
+        use_spot: True
+        disk_size: 512  # Ensure model checkpoints can fit.
+        disk_tier: best
+        ports: 8081  # Expose to internet traffic.
+
+    envs:
+        MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+        HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
+
+    setup: |
+        conda create -n vllm python=3.10 -y
+        conda activate vllm
+
+        pip install vllm==0.4.0.post1
+        # Install Gradio for web UI.
+        pip install gradio openai
+        pip install flash-attn==2.5.7
+
+    run: |
+        conda activate vllm
+        echo 'Starting vllm api server...'
+        python -u -m vllm.entrypoints.openai.api_server \
+            --port 8081 \
+            --model $MODEL_NAME \
+            --trust-remote-code \
+            --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+            2>&1 | tee api_server.log &
+        
+        echo 'Waiting for vllm api server to start...'
+        while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
+
+        echo 'Starting gradio server...'
+        git clone https://github.com/vllm-project/vllm.git || true
+        python vllm/examples/gradio_openai_chatbot_webserver.py \
+            -m $MODEL_NAME \
+            --port 8811 \
+            --model-url http://localhost:8081/v1 \
+            --stop-token-ids 128009,128001
+
+.. raw:: html
+
+    </details>
+
+Start the serving the Llama-3 8B model on multiple replicas:
+
+.. code-block:: console
+
+    HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN
+
+
+Wait until the service is ready:
 
 .. code-block:: console
 
-    sky launch -c vllm-serve-new -s serve.yaml --gpus A100:8 --env MODEL_NAME=decapoda-research/llama-65b-hf
+    watch -n10 sky serve status vllm
+
+
+.. raw:: html
+
+    <details>
+    <summary>Example outputs:</summary>
+
+.. code-block:: console
+
+    Services
+    NAME  VERSION  UPTIME  STATUS  REPLICAS  ENDPOINT
+    vllm  1        35s     READY   2/2       xx.yy.zz.100:30001
+
+    Service Replicas
+    SERVICE_NAME  ID  VERSION  IP            LAUNCHED     RESOURCES          STATUS  REGION
+    vllm          1   1        xx.yy.zz.121  18 mins ago  1x GCP({'L4': 1})  READY   us-east4
+    vllm          2   1        xx.yy.zz.245  18 mins ago  1x GCP({'L4': 1})  READY   us-east4
+
+.. raw:: html
+    
+    </details>
+
+After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
+
+.. code-block:: console
+
+    ENDPOINT=$(sky serve status --endpoint 8081 vllm)
+    curl -L http://$ENDPOINT/v1/chat/completions \
+        -H "Content-Type: application/json" \
+        -d '{
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "messages": [
+            {
+                "role": "system",
+                "content": "You are a helpful assistant."
+            },
+            {
+                "role": "user",
+                "content": "Who are you?"
+            }
+            ],
+            "stop_token_ids": [128009,  128001]
+        }'
+
+To enable autoscaling, you could specify additional configs in `services`:
+
+.. code-block:: yaml
+
+    services:
+        replica_policy:
+            min_replicas: 0
+            max_replicas: 3
+        target_qps_per_replica: 2
+
+This will scale the service up to when the QPS exceeds 2 for each replica.
+
+
+**Optional**: Connect a GUI to the endpoint
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas.
+
+.. raw:: html
+
+    <details>
+    <summary>Click to see the full GUI YAML</summary>
+
+.. code-block:: yaml
+
+    envs:
+        MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
+        ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. 
+
+    resources:
+        cpus: 2
+
+    setup: |
+        conda activate vllm
+        if [ $? -ne 0 ]; then
+            conda create -n vllm python=3.10 -y
+            conda activate vllm
+        fi
+
+        # Install Gradio for web UI.
+        pip install gradio openai
+
+    run: |
+        conda activate vllm
+        export PATH=$PATH:/sbin
+        WORKER_IP=$(hostname -I | cut -d' ' -f1)
+        CONTROLLER_PORT=21001
+        WORKER_PORT=21002
+
+        echo 'Starting gradio server...'
+        git clone https://github.com/vllm-project/vllm.git || true
+        python vllm/examples/gradio_openai_chatbot_webserver.py \
+            -m $MODEL_NAME \
+            --port 8811 \
+            --model-url http://$ENDPOINT/v1 \
+            --stop-token-ids 128009,128001 | tee ~/gradio.log
+
+.. raw:: html
+    
+    </details>
+
+1. Start the chat web UI:
+
+.. code-block:: console
+
+    sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
+
+
+2. Then, we can access the GUI at the returned gradio link:
+
+.. code-block:: console
+
+    | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
+
 

From 2c5f36574dc2f359d55dcca903bcf0b59b15a1f2 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 22 Apr 2024 17:21:48 -0700
Subject: [PATCH 069/106] [Core][Distributed] use absolute path for library
 file (#4271)

---
 vllm/utils.py | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/vllm/utils.py b/vllm/utils.py
index 3054f3d27d1d6..eda690e72829f 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -553,6 +553,34 @@ def nccl_integrity_check(filepath):
     return version.value
 
 
+@lru_cache(maxsize=None)
+def find_library(lib_name: str) -> str:
+    """
+    Find the library file in the system.
+    `lib_name` is full filename, with both prefix and suffix.
+    This function resolves `lib_name` to the full path of the library.
+    """
+    # Adapted from https://github.com/openai/triton/blob/main/third_party/nvidia/backend/driver.py#L19 # noqa
+    # According to https://en.wikipedia.org/wiki/Filesystem_Hierarchy_Standard
+    # `/sbin/ldconfig` should exist in all Linux systems.
+    # `/sbin/ldconfig` searches the library in the system
+    libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
+    # each line looks like the following:
+    # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
+    locs = [line.split()[-1] for line in libs.splitlines() if lib_name in line]
+    # `LD_LIBRARY_PATH` searches the library in the user-defined paths
+    env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
+    if not locs and env_ld_library_path:
+        locs = [
+            os.path.join(dir, lib_name)
+            for dir in env_ld_library_path.split(":")
+            if os.path.exists(os.path.join(dir, lib_name))
+        ]
+    if not locs:
+        raise ValueError(f"Cannot find {lib_name} in the system.")
+    return locs[0]
+
+
 def find_nccl_library():
     so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
 
@@ -572,9 +600,9 @@ def find_nccl_library():
         )
     else:
         if torch.version.cuda is not None:
-            so_file = vllm_nccl_path or "libnccl.so.2"
+            so_file = vllm_nccl_path or find_library("libnccl.so.2")
         elif torch.version.hip is not None:
-            so_file = "librccl.so.1"
+            so_file = find_library("librccl.so.1")
         else:
             raise ValueError("NCCL only supports CUDA and ROCm backends.")
         logger.info(f"Found nccl from library {so_file}")

From 9b6f4f803711b8741485307ce79828edd504df31 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 23 Apr 2024 02:53:01 +0100
Subject: [PATCH 070/106] Fix `autodoc` directives (#4272)

Co-authored-by: Harry Mellor <hmellor@oxts.com>
---
 docs/source/dev/engine/async_llm_engine.rst | 5 ++---
 docs/source/dev/engine/llm_engine.rst       | 6 +++---
 docs/source/dev/sampling_params.rst         | 3 ++-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/source/dev/engine/async_llm_engine.rst b/docs/source/dev/engine/async_llm_engine.rst
index 47db1e0a401b1..93fc310cb543b 100644
--- a/docs/source/dev/engine/async_llm_engine.rst
+++ b/docs/source/dev/engine/async_llm_engine.rst
@@ -1,7 +1,6 @@
-
 AsyncLLMEngine
 =================================
 
-.. autoclass:: vllm.engine.async_llm_engine.AsyncLLMEngine
-    :members: generate, abort
+.. autoclass:: vllm.AsyncLLMEngine
+    :members:
     :show-inheritance:
diff --git a/docs/source/dev/engine/llm_engine.rst b/docs/source/dev/engine/llm_engine.rst
index 1de6d7adc87c6..0b8c1e219d7c9 100644
--- a/docs/source/dev/engine/llm_engine.rst
+++ b/docs/source/dev/engine/llm_engine.rst
@@ -1,6 +1,6 @@
 LLMEngine
 =================================
 
-.. autoclass:: vllm.engine.llm_engine.LLMEngine
-    :members: add_request, abort_request, step
-    :show-inheritance:
\ No newline at end of file
+.. autoclass:: vllm.LLMEngine
+    :members:
+    :show-inheritance:
diff --git a/docs/source/dev/sampling_params.rst b/docs/source/dev/sampling_params.rst
index 844859b3ec1f0..ef3d1509bda6d 100644
--- a/docs/source/dev/sampling_params.rst
+++ b/docs/source/dev/sampling_params.rst
@@ -1,4 +1,5 @@
 Sampling Params
 ===============
 
-.. automodule:: vllm.sampling_params.SamplingParams
\ No newline at end of file
+.. autoclass:: vllm.SamplingParams
+    :members:

From fccd494e45c876305fb33f30c05651672bc6fb59 Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Tue, 23 Apr 2024 13:32:44 +0900
Subject: [PATCH 071/106] [Mypy] Part 3 fix typing for nested directories for
 most of directory (#4161)

---
 .github/workflows/mypy.yaml                   | 29 ++++++++++---------
 format.sh                                     | 26 ++++++++---------
 pyproject.toml                                |  6 ++--
 vllm/attention/backends/abstract.py           |  2 +-
 vllm/attention/backends/rocm_flash_attn.py    |  1 +
 vllm/attention/backends/torch_sdpa.py         |  3 +-
 vllm/attention/backends/xformers.py           |  1 +
 vllm/core/block/block_table.py                |  1 +
 vllm/core/block/common.py                     |  6 ++--
 vllm/core/block/interfaces.py                 |  6 ++--
 .../device_communicators/custom_all_reduce.py | 16 +++++-----
 .../device_communicators/pynccl.py            | 22 ++++++++------
 .../device_communicators/pynccl_utils.py      |  5 +++-
 vllm/engine/output_processor/interfaces.py    |  5 ++--
 vllm/engine/output_processor/multi_step.py    |  5 ++--
 vllm/engine/output_processor/single_step.py   |  9 +++---
 vllm/engine/output_processor/util.py          |  4 ++-
 vllm/entrypoints/openai/api_server.py         |  6 ++--
 vllm/entrypoints/openai/protocol.py           | 13 +++++----
 vllm/entrypoints/openai/serving_chat.py       |  2 +-
 vllm/entrypoints/openai/serving_completion.py |  5 +++-
 vllm/entrypoints/openai/serving_engine.py     | 19 +++++++-----
 vllm/lora/lora.py                             |  2 +-
 vllm/model_executor/layers/ops/sample.py      |  3 +-
 .../model_executor/layers/rotary_embedding.py |  3 +-
 vllm/transformers_utils/configs/jais.py       |  6 ++--
 .../tokenizer_group/__init__.py               |  2 +-
 .../tokenizer_group/ray_tokenizer_group.py    |  2 ++
 .../transformers_utils/tokenizers/baichuan.py |  4 +--
 29 files changed, 126 insertions(+), 88 deletions(-)

diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index 477ce9bc9ce85..9f1855696e20a 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -32,19 +32,20 @@ jobs:
         pip install types-setuptools
     - name: Mypy
       run: |
-        mypy vllm/attention/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/attention --config-file pyproject.toml
+        # TODO(sang): Fix nested dir
         mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/distributed/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/entrypoints/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/executor/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/usage/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml
-
-        mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/spec_decode/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
-        # TODO(sang): Follow up
-        # mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/distributed --config-file pyproject.toml
+        mypy vllm/entrypoints --config-file pyproject.toml
+        mypy vllm/executor --config-file pyproject.toml
+        mypy vllm/usage --config-file pyproject.toml
+        mypy vllm/*.py --config-file pyproject.toml
+        mypy vllm/transformers_utils --config-file pyproject.toml
+        mypy vllm/engine  --config-file pyproject.toml
+        mypy vllm/worker --config-file pyproject.toml
+        mypy vllm/spec_decode --config-file pyproject.toml
+        # TODO(sang): Fix nested dir
+        mypy vllm/model_executor/*.py  --config-file pyproject.toml
+        # TODO(sang): Fix nested dir
+        # mypy vllm/lora/*.py --config-file pyproject.toml
 
diff --git a/format.sh b/format.sh
index 84ee88b5b4c8a..bd2e9e89e1806 100755
--- a/format.sh
+++ b/format.sh
@@ -94,21 +94,19 @@ echo 'vLLM yapf: Done'
 
 # Run mypy
 echo 'vLLM mypy:'
-mypy vllm/attention/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/attention --config-file pyproject.toml
 mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/distributed/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/entrypoints/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/executor/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/usage/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml
-
-# TODO(sang): Follow up
-mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/spec_decode/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
-# mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/distributed --config-file pyproject.toml
+mypy vllm/entrypoints --config-file pyproject.toml
+mypy vllm/executor --config-file pyproject.toml
+mypy vllm/usage --config-file pyproject.toml
+mypy vllm/*.py --config-file pyproject.toml
+mypy vllm/transformers_utils --config-file pyproject.toml
+mypy vllm/engine  --config-file pyproject.toml
+mypy vllm/worker --config-file pyproject.toml
+mypy vllm/spec_decode --config-file pyproject.toml
+mypy vllm/model_executor/*.py  --config-file pyproject.toml
+# mypy vllm/lora/*.py --config-file pyproject.toml
 
 
 CODESPELL_EXCLUDES=(
diff --git a/pyproject.toml b/pyproject.toml
index 805170719e50a..21cdeb6ef0280 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,15 +45,17 @@ ignore = [
 python_version = "3.8"
 
 ignore_missing_imports = true
- check_untyped_defs = true
+check_untyped_defs = true
+follow_imports = "skip"
 
 files = "vllm"
 # TODO(woosuk): Include the code from Megatron and HuggingFace.
 exclude = [
     "vllm/model_executor/parallel_utils/|vllm/model_executor/models/",
+    # Ignore triton kernels in ops.
+    'vllm/attention/ops/.*\.py$'
 ]
 
-
 [tool.codespell]
 ignore-words-list = "dout, te, indicies"
 skip = "./tests/prompts,./benchmarks/sonnet.txt"
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 7a4ccecf702f4..be747c9900368 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -116,7 +116,7 @@ def forward(
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata[AttentionMetadataPerStage],
+        attn_metadata: AttentionMetadata,
         kv_scale: float,
     ) -> torch.Tensor:
         raise NotImplementedError
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index dbaa71fd16add..7c5863a030ff5 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -248,6 +248,7 @@ def forward(
 
         if prefill_meta := attn_metadata.prefill_metadata:
             # Prompt run.
+            assert prefill_meta.prompt_lens is not None
             if kv_cache is None or prefill_meta.block_tables.numel() == 0:
                 # triton attention
                 # When block_tables are not filled, it means q and k are the
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index d21b54b16db4b..55a7ce59ac6e0 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -106,7 +106,7 @@ def forward(
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: Optional[torch.Tensor],
-        attn_metadata: TorchSDPAMetadata,
+        attn_metadata: TorchSDPAMetadata,  # type: ignore
         kv_scale: float,
     ) -> torch.Tensor:
         """Forward pass with torch SDPA and PagedAttention.
@@ -136,6 +136,7 @@ def forward(
                                                 kv_scale)
 
         if attn_metadata.is_prompt:
+            assert attn_metadata.prompt_lens is not None
             if (kv_cache is None or attn_metadata.block_tables.numel() == 0):
                 if self.num_kv_heads != self.num_heads:
                     key = key.repeat_interleave(self.num_queries_per_kv, dim=1)
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index b745a04a143b4..572a4dc79a719 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -288,6 +288,7 @@ def _run_memory_efficient_xformers_forward(
             value: shape = [num_prefill_tokens, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         """
+        assert attn_metadata.prompt_lens is not None
         original_query = query
         if self.num_kv_heads != self.num_heads:
             # GQA/MQA requires the shape [B, M, G, H, K].
diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index 560267e55ea3a..f1b65b2514f76 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -104,6 +104,7 @@ def append_token_ids(self,
             token_ids (List[int]): The sequence of token IDs to be appended.
         """
         assert self._is_allocated
+        assert self._blocks is not None
 
         self.ensure_num_empty_slots(num_empty_slots=len(token_ids) +
                                     num_lookahead_slots)
diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py
index 50c70533c4fbc..f11234a0bf2dd 100644
--- a/vllm/core/block/common.py
+++ b/vllm/core/block/common.py
@@ -99,7 +99,7 @@ def __init__(
         refcounter: RefCounter,
         allocator: BlockAllocator,
     ):
-        self._copy_on_writes = defaultdict(list)
+        self._copy_on_writes: Dict[BlockId, List[BlockId]] = defaultdict(list)
         self._refcounter = refcounter
         self._allocator = allocator
 
@@ -138,6 +138,8 @@ def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
                 prev_block=block.prev_block).block_id
 
             # Track src/dst copy.
+            assert src_block_id is not None
+            assert block_id is not None
             self._copy_on_writes[src_block_id].append(block_id)
 
         return block_id
@@ -180,6 +182,6 @@ def recurse(block: Block, lst: List[Block]) -> None:
             recurse(block.prev_block, lst)
         lst.append(block)
 
-    all_blocks = []
+    all_blocks: List[Block] = []
     recurse(last_block, all_blocks)
     return all_blocks
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py
index fbceacf0ec417..50ce922118124 100644
--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -52,8 +52,7 @@ def __call__(
 class BlockAllocator(ABC):
 
     @abstractmethod
-    def allocate_mutable(self, prev_block: Optional[Block],
-                         device: Device) -> Block:
+    def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
         pass
 
     @abstractmethod
@@ -98,8 +97,7 @@ class NoFreeBlocksError(ValueError):
 class DeviceAwareBlockAllocator(BlockAllocator):
 
     @abstractmethod
-    def allocate_mutable(self, prev_block: Optional[Block],
-                         device: Device) -> Block:
+    def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
         pass
 
     @abstractmethod
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index 58cbe77baf7e0..9dbb427d91ff1 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -1,6 +1,6 @@
 import os
 from contextlib import contextmanager
-from typing import List, Optional
+from typing import Any, List, Optional
 
 import torch
 import torch.distributed as dist
@@ -18,7 +18,7 @@
 
 logger = init_logger(__name__)
 
-_CA_HANDLE = None
+_CA_HANDLE: Optional["CustomAllreduce"] = None
 _IS_CAPTURING = False
 _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
 
@@ -51,7 +51,7 @@ def init_custom_ar() -> None:
             "Cannot test GPU P2P because not all GPUs are visible to the "
             "current process. This might be the case if 'CUDA_VISIBLE_DEVICES'"
             " is set.")
-        return False
+        return
     # test nvlink first, this will filter out most of the cases
     # where custom allreduce is not supported
     if "CUDA_VISIBLE_DEVICES" in os.environ:
@@ -117,7 +117,7 @@ def custom_all_reduce(input: torch.Tensor) -> Optional[torch.Tensor]:
     ca_handle = get_handle()
     # when custom allreduce is disabled, this will be None
     if ca_handle is None:
-        return
+        return None
     if is_capturing():
         if torch.cuda.is_current_stream_capturing():
             if ca_handle.should_custom_ar(input):
@@ -135,6 +135,8 @@ def custom_all_reduce(input: torch.Tensor) -> Optional[torch.Tensor]:
         if ca_handle.should_custom_ar(input):
             return ca_handle.all_reduce_unreg(input)
 
+    return None
+
 
 @contextmanager
 def _nvml():
@@ -224,14 +226,14 @@ def _get_ipc_meta(self, inp: torch.Tensor):
         return self._gather_ipc_meta(shard_data)
 
     def _gather_ipc_meta(self, shard_data):
-        all_data = [None] * self.world_size
+        all_data: List[Optional[Any]] = [None] * self.world_size
         dist.all_gather_object(all_data, shard_data)
 
         handles = []
         offsets = []
         for i in range(len(all_data)):
-            handles.append(all_data[i][0])
-            offsets.append(all_data[i][1])
+            handles.append(all_data[i][0])  # type: ignore
+            offsets.append(all_data[i][1])  # type: ignore
         return handles, offsets
 
     def register_buffer(self, inp: torch.Tensor):
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index c57a4f59d442c..0707afe922f40 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -107,9 +107,10 @@ def ncclGetUniqueId() -> NcclUniqueId:
     ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, NcclUniqueId, ctypes.c_int
 ]
 
+ncclDataType_t = ctypes.c_int
 
-# enums
-class ncclDataType_t(ctypes.c_int):
+
+class ncclDataTypeEnum:
     ncclInt8 = 0
     ncclChar = 0
     ncclUint8 = 1
@@ -128,7 +129,7 @@ class ncclDataType_t(ctypes.c_int):
     ncclNumTypes = 10
 
     @classmethod
-    def from_torch(cls, dtype: torch.dtype) -> 'ncclDataType_t':
+    def from_torch(cls, dtype: torch.dtype) -> int:
         if dtype == torch.int8:
             return cls.ncclInt8
         if dtype == torch.uint8:
@@ -148,7 +149,10 @@ def from_torch(cls, dtype: torch.dtype) -> 'ncclDataType_t':
         raise ValueError(f"Unsupported dtype: {dtype}")
 
 
-class ncclRedOp_t(ctypes.c_int):
+ncclRedOp_t = ctypes.c_int
+
+
+class ncclRedOpTypeEnum:
     ncclSum = 0
     ncclProd = 1
     ncclMax = 2
@@ -157,7 +161,7 @@ class ncclRedOp_t(ctypes.c_int):
     ncclNumOps = 5
 
     @classmethod
-    def from_torch(cls, op: ReduceOp) -> 'ncclRedOp_t':
+    def from_torch(cls, op: ReduceOp) -> int:
         if op == ReduceOp.SUM:
             return cls.ncclSum
         if op == ReduceOp.PRODUCT:
@@ -180,8 +184,8 @@ def from_torch(cls, op: ReduceOp) -> 'ncclRedOp_t':
 _c_ncclAllReduce = nccl.ncclAllReduce
 _c_ncclAllReduce.restype = ctypes.c_int
 _c_ncclAllReduce.argtypes = [
-    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ncclDataType_t,
-    ncclRedOp_t, ctypes.c_void_p, ctypes.c_void_p
+    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ncclRedOp_t,
+    ncclDataType_t, ctypes.c_void_p, ctypes.c_void_p
 ]
 
 # equivalent to c declaration:
@@ -251,8 +255,8 @@ def all_reduce(self,
         result = _c_ncclAllReduce(ctypes.c_void_p(tensor.data_ptr()),
                                   ctypes.c_void_p(tensor.data_ptr()),
                                   tensor.numel(),
-                                  ncclDataType_t.from_torch(tensor.dtype),
-                                  ncclRedOp_t.from_torch(op), self.comm,
+                                  ncclDataTypeEnum.from_torch(tensor.dtype),
+                                  ncclRedOpTypeEnum.from_torch(op), self.comm,
                                   ctypes.c_void_p(stream.cuda_stream))
         assert result == 0
 
diff --git a/vllm/distributed/device_communicators/pynccl_utils.py b/vllm/distributed/device_communicators/pynccl_utils.py
index aeb73015733d1..916dc814af7eb 100644
--- a/vllm/distributed/device_communicators/pynccl_utils.py
+++ b/vllm/distributed/device_communicators/pynccl_utils.py
@@ -30,6 +30,7 @@ def is_initialized() -> bool:
 def set_pynccl_stream(stream: torch.cuda.Stream):
     """Set the cuda stream for communication"""
     try:
+        assert comm is not None
         comm.stream = stream
         yield
     finally:
@@ -52,6 +53,7 @@ def init_process_group(world_size: int,
 def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None:
     """All-reduces the input tensor across the process group."""
     assert input_.is_cuda, f"{input_} should be a cuda tensor"
+    assert comm is not None
     comm.all_reduce(input_, op)
 
 
@@ -62,8 +64,9 @@ def destroy_process_group() -> None:
 
 def get_world_size() -> int:
     """Returns the world size."""
+    assert comm is not None
     return comm.world_size
 
 
-def get_nccl_backend():
+def get_nccl_backend() -> Optional["NCCLCommunicator"]:
     return comm
diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py
index 9ddac7a04cb36..f307ea4da3011 100644
--- a/vllm/engine/output_processor/interfaces.py
+++ b/vllm/engine/output_processor/interfaces.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Callable, Iterable, List
+from typing import Callable, List
 
 from transformers import PreTrainedTokenizer
 
@@ -8,6 +8,7 @@
 from vllm.engine.output_processor.stop_checker import StopChecker
 from vllm.sequence import Sequence, SequenceGroup, SequenceGroupOutput
 from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
 
 
 class SequenceGroupOutputProcessor(ABC):
@@ -27,7 +28,7 @@ def create_output_processor(
         scheduler_config: SchedulerConfig,
         detokenizer: Detokenizer,
         scheduler: Scheduler,
-        seq_counter: Iterable[int],
+        seq_counter: Counter,
         get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
         stop_checker: "StopChecker",
     ):
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
index 50da0d35fcec1..39e99d06ed875 100644
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -1,4 +1,4 @@
-from typing import Callable, Iterable, List
+from typing import Callable, List
 
 from transformers import PreTrainedTokenizer
 
@@ -11,6 +11,7 @@
 from vllm.sequence import (Logprob, Sequence, SequenceGroup,
                            SequenceGroupOutput, SequenceOutput, SequenceStatus)
 from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
 
 logger = init_logger(__name__)
 
@@ -33,7 +34,7 @@ def __init__(
         self,
         detokenizer: Detokenizer,
         scheduler: Scheduler,
-        seq_counter: Iterable[int],
+        seq_counter: Counter,
         get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
         stop_checker: StopChecker,
     ):
diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
index b32937327ba7f..7e9d652446703 100644
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -1,4 +1,4 @@
-from typing import Iterable, List, Tuple, Union
+from typing import Dict, List, Tuple, Union
 
 from vllm.config import SchedulerConfig
 from vllm.core.scheduler import Scheduler
@@ -10,6 +10,7 @@
 from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput,
                            SequenceOutput, SequenceStatus)
 from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
 
 logger = init_logger(__name__)
 
@@ -33,7 +34,7 @@ def __init__(
         scheduler_config: SchedulerConfig,
         detokenizer: Detokenizer,
         scheduler: Scheduler,
-        seq_counter: Iterable[int],
+        seq_counter: Counter,
         stop_checker: StopChecker,
     ):
         self.scheduler_config = scheduler_config
@@ -69,7 +70,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
         samples = outputs.samples
         parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
         existing_finished_seqs = seq_group.get_finished_seqs()
-        parent_child_dict = {
+        parent_child_dict: Dict[int, List[SequenceOutput]] = {
             parent_seq.seq_id: []
             for parent_seq in parent_seqs
         }
@@ -92,7 +93,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
                 continue
             # Fork the parent sequence if there are multiple child samples.
             for child_sample in child_samples[:-1]:
-                new_child_seq_id = next(self.seq_counter)
+                new_child_seq_id: int = next(self.seq_counter)
                 child = parent.fork(new_child_seq_id)
                 child.append_token_id(child_sample.output_token,
                                       child_sample.logprobs)
diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py
index 5fbb09a857a46..d076fee8c2a36 100644
--- a/vllm/engine/output_processor/util.py
+++ b/vllm/engine/output_processor/util.py
@@ -8,7 +8,9 @@ def create_output_by_sequence_group(sampler_outputs: List[SamplerOutput],
     """Helper method which transforms a 2d list organized by
     [step][sequence group] into [sequence group][step].
     """
-    output_by_sequence_group = [[] for _ in range(num_seq_groups)]
+    output_by_sequence_group: List[List[SamplerOutput]] = [
+        [] for _ in range(num_seq_groups)
+    ]
     for step in sampler_outputs:
         for i, sequence_group_output in enumerate(step):
             output_by_sequence_group[i].append(sequence_group_output)
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index d6673976bb775..37d76b8e74055 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -18,6 +18,7 @@
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.cli_args import make_arg_parser
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              ChatCompletionResponse,
                                               CompletionRequest, ErrorResponse)
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
@@ -26,8 +27,8 @@
 
 TIMEOUT_KEEP_ALIVE = 5  # seconds
 
-openai_serving_chat: OpenAIServingChat = None
-openai_serving_completion: OpenAIServingCompletion = None
+openai_serving_chat: OpenAIServingChat
+openai_serving_completion: OpenAIServingCompletion
 logger = init_logger(__name__)
 
 
@@ -95,6 +96,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
         return StreamingResponse(content=generator,
                                  media_type="text/event-stream")
     else:
+        assert isinstance(generator, ChatCompletionResponse)
         return JSONResponse(content=generator.model_dump())
 
 
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index cf779d44c816b..d9763d024eb83 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -4,7 +4,8 @@
 from typing import Dict, List, Literal, Optional, Union
 
 import torch
-from pydantic import BaseModel, Field, conint, model_validator
+from pydantic import BaseModel, Field, model_validator
+from typing_extensions import Annotated
 
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
@@ -30,7 +31,7 @@ class ModelPermission(BaseModel):
     allow_fine_tuning: bool = False
     organization: str = "*"
     group: Optional[str] = None
-    is_blocking: str = False
+    is_blocking: bool = False
 
 
 class ModelCard(BaseModel):
@@ -56,7 +57,7 @@ class UsageInfo(BaseModel):
 
 class ResponseFormat(BaseModel):
     # type must be "json_object" or "text"
-    type: str = Literal["text", "json_object"]
+    type: Literal["text", "json_object"]
 
 
 class ChatCompletionRequest(BaseModel):
@@ -152,6 +153,7 @@ def to_sampling_params(self) -> SamplingParams:
             def logit_bias_logits_processor(
                     token_ids: List[int],
                     logits: torch.Tensor) -> torch.Tensor:
+                assert self.logit_bias is not None
                 for token_id, bias in self.logit_bias.items():
                     # Clamp the bias between -100 and 100 per OpenAI API spec
                     bias = min(100, max(-100, bias))
@@ -213,7 +215,7 @@ class CompletionRequest(BaseModel):
     logit_bias: Optional[Dict[str, float]] = None
     logprobs: Optional[int] = None
     max_tokens: Optional[int] = 16
-    n: Optional[int] = 1
+    n: int = 1
     presence_penalty: Optional[float] = 0.0
     seed: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
@@ -235,7 +237,7 @@ class CompletionRequest(BaseModel):
     min_tokens: Optional[int] = 0
     skip_special_tokens: Optional[bool] = True
     spaces_between_special_tokens: Optional[bool] = True
-    truncate_prompt_tokens: Optional[conint(ge=1)] = None
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
     # doc: end-completion-sampling-params
 
     # doc: begin-completion-extra-params
@@ -289,6 +291,7 @@ def to_sampling_params(self):
             def logit_bias_logits_processor(
                     token_ids: List[int],
                     logits: torch.Tensor) -> torch.Tensor:
+                assert self.logit_bias is not None
                 for token_id, bias in self.logit_bias.items():
                     # Clamp the bias between -100 and 100 per OpenAI API spec
                     bias = min(100, max(-100, bias))
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index f35eab15bc487..d502dd0a4eb75 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -115,12 +115,12 @@ async def chat_completion_stream_generator(
         first_iteration = True
 
         # Send response for each token for each request.n (index)
+        assert request.n is not None
         previous_texts = [""] * request.n
         previous_num_tokens = [0] * request.n
         finish_reason_sent = [False] * request.n
         try:
             async for res in result_generator:
-                res: RequestOutput
                 # We need to do it here, because if there are exceptions in
                 # the result_generator, it needs to be sent as the FIRST
                 # response (by the try...catch).
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index b7e2530a69b51..211b2e0424c3e 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -185,6 +185,7 @@ async def completion_stream_generator(
         model_name: str,
         num_prompts: int,
     ) -> AsyncGenerator[str, None]:
+        assert request.n is not None
         previous_texts = [""] * request.n * num_prompts
         previous_num_tokens = [0] * request.n * num_prompts
         has_echoed = [False] * request.n * num_prompts
@@ -202,6 +203,7 @@ async def completion_stream_generator(
                     # TODO(simon): optimize the performance by avoiding full
                     # text O(n^2) sending.
 
+                    assert request.max_tokens is not None
                     if request.echo and request.max_tokens == 0:
                         # only return the prompt
                         delta_text = res.prompt
@@ -279,7 +281,7 @@ def request_output_to_completion_response(
         created_time: int,
         model_name: str,
     ) -> CompletionResponse:
-        choices = []
+        choices: List[CompletionResponseChoice] = []
         num_prompt_tokens = 0
         num_generated_tokens = 0
         for final_res in final_res_batch:
@@ -289,6 +291,7 @@ def request_output_to_completion_response(
             prompt_text = final_res.prompt
 
             for output in final_res.outputs:
+                assert request.max_tokens is not None
                 if request.echo and request.max_tokens == 0:
                     token_ids = prompt_token_ids
                     top_logprobs = prompt_logprobs
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 81a15417c17ee..f7073feb2f3d1 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -4,7 +4,9 @@
 from http import HTTPStatus
 from typing import Dict, List, Optional, Tuple, Union
 
-from pydantic import conint
+from pydantic import Field
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+from typing_extensions import Annotated
 
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
@@ -45,7 +47,8 @@ def __init__(self,
             ]
 
         self.max_model_len = 0
-        self.tokenizer = None
+        # Lazy initialized
+        self.tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
 
         try:
             event_loop = asyncio.get_running_loop()
@@ -100,7 +103,7 @@ async def show_available_models(self) -> ModelList:
     def _create_logprobs(
         self,
         token_ids: List[int],
-        top_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None,
+        top_logprobs: List[Optional[Dict[int, Logprob]]],
         num_output_top_logprobs: Optional[int] = None,
         initial_text_offset: int = 0,
     ) -> LogProbs:
@@ -116,6 +119,7 @@ def _create_logprobs(
                 token = self.tokenizer.decode(token_id)
                 logprobs.tokens.append(token)
                 logprobs.token_logprobs.append(None)
+                assert logprobs.top_logprobs is not None
                 logprobs.top_logprobs.append(None)
             else:
                 token_logprob = step_top_logprobs[token_id].logprob
@@ -124,6 +128,7 @@ def _create_logprobs(
                 logprobs.token_logprobs.append(token_logprob)
 
                 if num_output_top_logprobs:
+                    assert logprobs.top_logprobs is not None
                     logprobs.top_logprobs.append({
                         # Convert float("-inf") to the
                         # JSON-serializable float that OpenAI uses
@@ -163,9 +168,9 @@ def create_streaming_error_response(
 
     async def _check_model(self, request) -> Optional[ErrorResponse]:
         if request.model in self.served_model_names:
-            return
+            return None
         if request.model in [lora.lora_name for lora in self.lora_requests]:
-            return
+            return None
         return self.create_error_response(
             message=f"The model `{request.model}` does not exist.",
             err_type="NotFoundError",
@@ -173,7 +178,7 @@ async def _check_model(self, request) -> Optional[ErrorResponse]:
 
     def _maybe_get_lora(self, request) -> Optional[LoRARequest]:
         if request.model in self.served_model_names:
-            return
+            return None
         for lora in self.lora_requests:
             if request.model == lora.lora_name:
                 return lora
@@ -185,7 +190,7 @@ def _validate_prompt_and_tokenize(
         request: Union[ChatCompletionRequest, CompletionRequest],
         prompt: Optional[str] = None,
         prompt_ids: Optional[List[int]] = None,
-        truncate_prompt_tokens: Optional[conint(ge=1)] = None
+        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
     ) -> Tuple[List[int], str]:
         if not (prompt or prompt_ids):
             raise ValueError("Either prompt or prompt_ids should be provided.")
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py
index 21c2196eb2739..fefad16700fe3 100644
--- a/vllm/lora/lora.py
+++ b/vllm/lora/lora.py
@@ -33,7 +33,7 @@ def __init__(
     def optimize(self) -> "LoRALayerWeights":
         """Optimize the LoRA by merging the scaling into lora_b."""
         if self.scaling == 1:
-            return
+            return self
         self.lora_b *= self.scaling
         self.scaling = 1
         return self
diff --git a/vllm/model_executor/layers/ops/sample.py b/vllm/model_executor/layers/ops/sample.py
index a19e9461f41f7..d08ae6064aa2a 100644
--- a/vllm/model_executor/layers/ops/sample.py
+++ b/vllm/model_executor/layers/ops/sample.py
@@ -29,8 +29,8 @@ def _multi_split_sample(
     sampled_tokens_size: Tuple[int, int],
     sampled_logprobs_size: Tuple[int, int],
     sample_indices: torch.Tensor,
+    logprobs: torch.Tensor,
     *,
-    logprobs: Optional[torch.Tensor] = None,
     modify_greedy_probs: bool = False,
     save_logprobs: bool = False,
 ):
@@ -167,6 +167,7 @@ def sample(
         sampled_logprobs_size = (0, 0)
         logprobs = probs
 
+    assert logprobs is not None
     if _save_modified_probs:
         sampled_modified_probs_size = sampled_tokens_size
     else:
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 6519781c8a8eb..a5225148d7828 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -108,7 +108,8 @@ def _forward(
             query_pass = query[..., self.rotary_dim:]
             key_pass = key[..., self.rotary_dim:]
 
-        self.cos_sin_cache = self.cos_sin_cache.to(positions.device)
+        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(
+            positions.device)
         cos_sin = self.cos_sin_cache[torch.add(positions, offsets)
                                      if offsets is not None else positions]
         cos, sin = cos_sin.chunk(2, dim=-1)
diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py
index 94f438716f8bf..b06a946f34a47 100644
--- a/vllm/transformers_utils/configs/jais.py
+++ b/vllm/transformers_utils/configs/jais.py
@@ -222,13 +222,15 @@ def _alibi_scaling_validation(self):
                              f"got {alibi_scaling_type}")
         if (alibi_scaling_factor is not None
                 and not isinstance(alibi_scaling_factor, float)
-                or alibi_scaling_factor <= 1.0):
+                or (alibi_scaling_factor is not None
+                    and alibi_scaling_factor <= 1.0)):
             raise ValueError(
                 f"`alibi_scaling`'s factor field must be a float > 1.0,"
                 f"got {alibi_scaling_factor}")
         if (alibi_dynamic_scaling is not None
                 and not isinstance(alibi_dynamic_scaling, int)
-                or alibi_dynamic_scaling <= 1):
+                or (alibi_dynamic_scaling is not None
+                    and alibi_dynamic_scaling <= 1)):
             raise ValueError(
                 f"`alibi_scaling`'s `train_seq_len` field must be an"
                 f"integer > 1, got {alibi_dynamic_scaling}")
diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py
index a3b979e8fbc13..69380d67f9b94 100644
--- a/vllm/transformers_utils/tokenizer_group/__init__.py
+++ b/vllm/transformers_utils/tokenizer_group/__init__.py
@@ -11,7 +11,7 @@
     from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
         RayTokenizerGroupPool)
 else:
-    RayTokenizerGroupPool = None
+    RayTokenizerGroupPool = None  # type: ignore
 
 
 def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
index c00b02fdbbbc0..f3cdc00564dbb 100644
--- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
@@ -89,6 +89,7 @@ def encode(self,
         This is blocking.
         """
         self._ensure_queue_initialized()
+        assert self._idle_actors is not None
 
         if self._idle_actors.empty():
             raise RuntimeError("No idle actors available.")
@@ -120,6 +121,7 @@ async def encode_async(
         This is non-blocking.
         """
         self._ensure_queue_initialized()
+        assert self._idle_actors is not None
 
         actor = await self._idle_actors.get()
         try:
diff --git a/vllm/transformers_utils/tokenizers/baichuan.py b/vllm/transformers_utils/tokenizers/baichuan.py
index 79894035cb1f1..76daabc41e0a2 100644
--- a/vllm/transformers_utils/tokenizers/baichuan.py
+++ b/vllm/transformers_utils/tokenizers/baichuan.py
@@ -114,9 +114,9 @@ def _convert_id_to_token(self, index):
         token = self.sp_model.IdToPiece(index)
         return token
 
-    def convert_tokens_to_string(self, tokens):
+    def convert_tokens_to_string(self, tokens: List[str]):
         """Converts a sequence of tokens (string) in a single string."""
-        current_sub_tokens = []
+        current_sub_tokens: List[str] = []
         out_string = ""
         prev_is_special = False
         for i, token in enumerate(tokens):

From 9e8c3396cc7f3386e54a29d864cbe0ab162b180c Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Tue, 23 Apr 2024 00:49:08 -0700
Subject: [PATCH 072/106] [Core] Some simplification of WorkerWrapper changes
 (#4183)

---
 vllm/executor/ray_gpu_executor.py | 90 ++++++++++++++-----------------
 vllm/worker/worker_base.py        |  9 ++--
 2 files changed, 45 insertions(+), 54 deletions(-)

diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index f779b0f8a5113..d0b5e682bb6f7 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -2,6 +2,7 @@
 import os
 import pickle
 from collections import defaultdict
+from itertools import islice, repeat
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple
 
 from vllm.engine.ray_utils import RayWorkerWrapper, ray
@@ -136,16 +137,14 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
         VLLM_INSTANCE_ID = get_vllm_instance_id()
 
         # Set environment variables for the driver and workers.
-        all_args_to_update_environment_variables = []
-        for (node_id, _) in worker_node_and_gpu_ids:
-            all_args_to_update_environment_variables.append([{
-                "CUDA_VISIBLE_DEVICES":
-                ",".join(map(str, node_gpus[node_id])),
-                "VLLM_INSTANCE_ID":
-                VLLM_INSTANCE_ID,
-                "VLLM_TRACE_FUNCTION":
-                os.getenv("VLLM_TRACE_FUNCTION", "0"),
-            }])
+        all_args_to_update_environment_variables = [({
+            "CUDA_VISIBLE_DEVICES":
+            ",".join(map(str, node_gpus[node_id])),
+            "VLLM_INSTANCE_ID":
+            VLLM_INSTANCE_ID,
+            "VLLM_TRACE_FUNCTION":
+            os.getenv("VLLM_TRACE_FUNCTION", "0"),
+        }, ) for (node_id, _) in worker_node_and_gpu_ids]
         self._run_workers("update_environment_variables",
                           all_args=all_args_to_update_environment_variables)
 
@@ -156,10 +155,9 @@ def collect_arg_helper_func(**kwargs):
             # avoid writing `{"name": value}` manually
             return kwargs
 
-        init_worker_all_kwargs = []
-
         # Initialize the actual workers inside worker wrapper.
-        for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids, ):
+        init_worker_all_kwargs = []
+        for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids):
             local_rank = node_workers[node_id].index(rank)
             init_worker_all_kwargs.append(
                 collect_arg_helper_func(
@@ -265,40 +263,40 @@ def _run_workers(
         self,
         method: str,
         *args,
-        driver_args: Optional[Tuple[Any]] = None,
+        driver_args: Optional[Tuple[Any, ...]] = None,
         driver_kwargs: Optional[Dict[str, Any]] = None,
-        all_args: Optional[List[List[Any]]] = None,
+        all_args: Optional[List[Tuple[Any, ...]]] = None,
         all_kwargs: Optional[List[Dict[str, Any]]] = None,
         use_dummy_driver: bool = False,
         max_concurrent_workers: Optional[int] = None,
         use_ray_compiled_dag: bool = False,
         **kwargs,
     ) -> Any:
-        """Runs the given method on all workers.
-        all_args and all_kwargs are used to pass heterogeneous arguments,
-        i.e. different arguments for each worker.
+        """Runs the given method on all workers. Can be used in the following
+        ways:
+
+        - args/kwargs: All workers share the same args/kwargs
+        - args/kwargs and driver_args/driver_kwargs: Driver worker has
+          different args
+        - all_args/all_kwargs: args/kwargs for each worker are specified
+          individually
         """
-        if driver_args is None:
-            driver_args = args
-        if driver_kwargs is None:
-            driver_kwargs = kwargs
-
-        # for mypy type checking
-        assert driver_args is not None
-        assert driver_kwargs is not None
-        if all_args is None:
-            all_args = [driver_args] + [args] * len(self.workers)
-        if all_kwargs is None:
-            all_kwargs = [driver_kwargs] + [kwargs] * len(self.workers)
-
-        # for mypy type checking
-        assert all_args is not None
-        assert all_kwargs is not None
 
         if max_concurrent_workers:
             raise NotImplementedError(
                 "max_concurrent_workers is not supported yet.")
 
+        if driver_args is None:
+            driver_args = args if all_args is None else all_args[0]
+        if driver_kwargs is None:
+            driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
+
+        count = len(self.workers)
+        all_worker_args = repeat(args, count) if all_args is None \
+            else islice(all_args, 1, None)
+        all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
+            else islice(all_kwargs, 1, None)
+
         if use_ray_compiled_dag:
             # Right now, compiled DAG can only accept a single
             # input. TODO(sang): Fix it.
@@ -310,22 +308,17 @@ def _run_workers(
                 worker.execute_method.remote(method, *worker_args,
                                              **worker_kwargs)
                 for (worker, worker_args, worker_kwargs
-                     ) in zip(self.workers, all_args[1:], all_kwargs[1:])
+                     ) in zip(self.workers, all_worker_args, all_worker_kwargs)
             ]
 
-        if driver_args is None:
-            driver_args = args
-        if driver_kwargs is None:
-            driver_kwargs = kwargs
-
         # Start the driver worker after all the ray workers.
         if not use_dummy_driver:
             driver_worker_output = self.driver_worker.execute_method(
-                method, *all_args[0], **all_kwargs[0])
+                method, *driver_args, **driver_kwargs)
         else:
             driver_worker_output = ray.get(
                 self.driver_dummy_worker.execute_method.remote(
-                    method, *all_args[0], **all_kwargs[0]))
+                    method, *driver_args, **driver_kwargs))
         # Get the results of the ray workers.
         if self.workers:
             if use_ray_compiled_dag:
@@ -383,6 +376,10 @@ def _check_if_any_actor_is_dead(self):
 
 class RayGPUExecutorAsync(RayGPUExecutor, ExecutorAsyncBase):
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.driver_executor = make_async(self.driver_worker.execute_method)
+
     async def _run_workers_async(
         self,
         method: str,
@@ -399,13 +396,8 @@ async def _run_workers_async(
         if driver_kwargs is None:
             driver_kwargs = kwargs
 
-        # Run the driver worker asynchronously.
-        def helper():
-            return self.driver_worker.execute_method(method, *driver_args,
-                                                     **driver_kwargs)
-
-        driver_executor = make_async(helper)
-        coros.append(driver_executor())
+        coros.append(
+            self.driver_executor(method, *driver_args, **driver_kwargs))
 
         # Run the ray workers asynchronously.
         for worker in self.workers:
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 783dff3a43404..bcd04e0f98db6 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -108,7 +108,8 @@ def __init__(self,
         self.worker_class_name = worker_class_name
         self.worker = None
 
-    def update_environment_variables(self, envs: Dict[str, str]) -> None:
+    @staticmethod
+    def update_environment_variables(envs: Dict[str, str]) -> None:
         key = 'CUDA_VISIBLE_DEVICES'
         if key in envs and key in os.environ:
             # overwriting CUDA_VISIBLE_DEVICES is desired behavior
@@ -138,10 +139,8 @@ def init_worker(self, *args, **kwargs):
 
     def execute_method(self, method, *args, **kwargs):
         try:
-            if hasattr(self, method):
-                executor = getattr(self, method)
-            else:
-                executor = getattr(self.worker, method)
+            target = self if self.worker is None else self.worker
+            executor = getattr(target, method)
             return executor(*args, **kwargs)
         except Exception as e:
             # if the driver worker also execute methods,

From 1c429b433226b577b39faff42ed49139f62a216a Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Tue, 23 Apr 2024 17:02:11 +0900
Subject: [PATCH 073/106] [Core] Scheduling optimization 2 (#4280)

---
 tests/core/test_scheduler.py |  3 ++-
 vllm/core/scheduler.py       | 10 ++++++++--
 vllm/sequence.py             |  5 +++++
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index a2511238506b0..ab471d206618b 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -563,7 +563,8 @@ def cannot_append_second_group(seq_group, num_lookahead_slots):
     assert len(output.preempted) == 2
     # Verify budgets are updated.
     assert budget.num_batched_tokens == 1
-    assert budget.num_curr_seqs == 1
+    # NOTE: When enable_chunk is False, num_seqs budget is not updated.
+    # assert budget.num_curr_seqs == 1
     # Both should be preempted, not swapped.
     assert output.blocks_to_swap_out == {}
     # Nothing is copied.
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index a302557d81fa8..fe6b29d17d753 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -395,12 +395,12 @@ def _schedule_running(
             # We can have up to 1 running prefill at any given time in running
             # queue, which means we can guarantee chunk size is at least 1.
             assert num_running_tokens != 0
-            num_running_seqs = seq_group.get_max_num_running_seqs()
 
             running_queue.popleft()
             while not self._can_append_slots(seq_group):
                 budget.subtract_num_batched_tokens(seq_group.request_id,
                                                    num_running_tokens)
+                num_running_seqs = seq_group.get_max_num_running_seqs()
                 budget.subtract_num_seqs(seq_group.request_id,
                                          num_running_seqs)
                 if curr_loras is not None and seq_group.lora_int_id > 0:
@@ -439,7 +439,13 @@ def _schedule_running(
                                                token_chunk_size=1))
                 budget.add_num_batched_tokens(seq_group.request_id,
                                               num_running_tokens)
-                budget.add_num_seqs(seq_group.request_id, num_running_seqs)
+                # OPTIMIZATION:  Note that get_max_num_running_seqs is
+                # expensive. For the default scheduling chase where
+                # enable_chunking is False, num_seqs are updated before running
+                # this method, so we don't have to update it again here.
+                if enable_chunking:
+                    num_running_seqs = seq_group.get_max_num_running_seqs()
+                    budget.add_num_seqs(seq_group.request_id, num_running_seqs)
                 if curr_loras is not None and seq_group.lora_int_id > 0:
                     curr_loras.add(seq_group.lora_int_id)
 
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 7dcacab6f2ab6..b296b37a84f15 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -508,6 +508,11 @@ def get_num_uncomputed_tokens(self) -> int:
         return num_uncomputed_tokens
 
     def num_seqs(self, status: Optional[SequenceStatus] = None) -> int:
+        # Optimization. We don't need to call get_seqs if we don't need to
+        # filter by states.
+        if status is None:
+            return len(self.seqs_dict)
+
         return len(self.get_seqs(status))
 
     def num_unfinished_seqs(self) -> int:

From dd092ddc4f2b4c72a0054c2433c53a234d9eea75 Mon Sep 17 00:00:00 2001
From: Cade Daniel <edacih@gmail.com>
Date: Tue, 23 Apr 2024 01:02:36 -0700
Subject: [PATCH 074/106] [Speculative decoding 7/9] Speculative decoding
 end-to-end correctness tests. (#3951)

---
 tests/samplers/test_rejection_sampler.py      |   8 +-
 tests/samplers/test_sampler.py                |   3 +-
 tests/spec_decode/e2e/__init__.py             |   0
 tests/spec_decode/e2e/conftest.py             |  45 +-
 tests/spec_decode/e2e/test_compatibility.py   | 169 ++++++
 tests/spec_decode/e2e/test_correctness.py     | 540 ++++++++++++++++--
 tests/spec_decode/test_metrics.py             |   4 +-
 tests/spec_decode/test_multi_step_worker.py   |   4 +-
 tests/spec_decode/test_spec_decode_worker.py  |  40 +-
 tests/spec_decode/utils.py                    |   7 +-
 vllm/config.py                                |  67 ++-
 vllm/engine/arg_utils.py                      |  18 +-
 vllm/engine/llm_engine.py                     |  38 +-
 vllm/engine/metrics.py                        |  23 +-
 vllm/executor/gpu_executor.py                 |   1 +
 .../layers/rejection_sampler.py               |   7 +
 vllm/model_executor/layers/sampler.py         | 182 +++++-
 vllm/spec_decode/batch_expansion.py           |  70 ++-
 vllm/spec_decode/interfaces.py                |   4 +-
 vllm/spec_decode/metrics.py                   |  31 +-
 vllm/spec_decode/multi_step_worker.py         |  29 +-
 vllm/spec_decode/spec_decode_worker.py        |  49 +-
 22 files changed, 1164 insertions(+), 175 deletions(-)
 create mode 100644 tests/spec_decode/e2e/__init__.py
 create mode 100644 tests/spec_decode/e2e/test_compatibility.py

diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py
index 9cfbfadfe98c5..55dd1a8f54cc6 100644
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -96,12 +96,16 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
         bonus_token_ids,
     )
 
+    # Bonus tokens are currently disabled. Verify they're set to -1.
+    # See https://github.com/vllm-project/vllm/issues/4212
+    expected_bonus_token_ids = bonus_token_ids.clone() * 0 - 1
+
     if which_tokens_accepted == "all_tokens_accepted":
         # Expect all tokens to be equal to draft tokens.
         assert torch.equal(output_token_ids[:, :-1], draft_token_ids)
 
         # Expect all bonus tokens to be included.
-        assert torch.equal(output_token_ids[:, -1:], bonus_token_ids)
+        assert torch.equal(output_token_ids[:, -1:], expected_bonus_token_ids)
     elif which_tokens_accepted == "no_tokens_accepted":
         # Expect first token to be equal to recovered tokens.
         assert torch.equal(output_token_ids[:, 0], recovered_token_ids[:, 0])
@@ -111,7 +115,7 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
                            torch.ones_like(output_token_ids[:, 1:]) * -1)
     elif which_tokens_accepted == "some_tokens_accepted":
         recovered_plus_bonus = torch.cat(
-            (recovered_token_ids, bonus_token_ids), dim=-1)
+            (recovered_token_ids, expected_bonus_token_ids), dim=-1)
         # Assert first rejected token is a recovered token or bonus token.
         assert torch.equal(
             recovered_plus_bonus[torch.arange(0, batch_size),
diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py
index 1c08786e69ccd..f1670c539fd00 100644
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -647,7 +647,8 @@ def test_sampler_top_k_top_p(seed: int, device: str):
     def mock_sample(probs, *args, **kwargs):
         nonlocal sample_probs
         sample_probs = probs
-        return [[prob.topk(1, dim=-1).indices.tolist(), [0]] for prob in probs]
+        return ([[prob.topk(1, dim=-1).indices.tolist(), [0]]
+                 for prob in probs], None)
 
     with patch("vllm.model_executor.layers.sampler._sample", mock_sample):
         sampler(logits=fake_logits, sampling_metadata=sampling_metadata)
diff --git a/tests/spec_decode/e2e/__init__.py b/tests/spec_decode/e2e/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index 1d99cb5d32219..59fb8311fc5b7 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -1,3 +1,5 @@
+from typing import List, Tuple
+
 import pytest
 
 from tests.conftest import cleanup
@@ -6,28 +8,34 @@
 
 
 @pytest.fixture
-def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
-                           baseline_llm_kwargs, seed):
-    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
+def baseline_llm_generator(request, common_llm_kwargs,
+                           per_test_common_llm_kwargs, baseline_llm_kwargs,
+                           seed):
+    return create_llm_generator("baseline", request, common_llm_kwargs,
+                                per_test_common_llm_kwargs,
                                 baseline_llm_kwargs, seed)
 
 
 @pytest.fixture
-def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
+def test_llm_generator(request, common_llm_kwargs, per_test_common_llm_kwargs,
                        test_llm_kwargs, seed):
-    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
-                                test_llm_kwargs, seed)
+    return create_llm_generator("test", request, common_llm_kwargs,
+                                per_test_common_llm_kwargs, test_llm_kwargs,
+                                seed)
 
 
-def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
-                         distinct_llm_kwargs, seed):
+def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
+                         per_test_common_llm_kwargs, distinct_llm_kwargs,
+                         seed):
     kwargs = {
         **common_llm_kwargs,
         **per_test_common_llm_kwargs,
         **distinct_llm_kwargs,
     }
+    test_name = request.node.name
 
     def generator_inner():
+        print(f'Creating {baseline_or_test=} LLM for {test_name=}. {kwargs=}')
         llm = LLM(**kwargs)
 
         set_random_seed(seed)
@@ -36,6 +44,23 @@ def generator_inner():
         del llm
         cleanup()
 
-    for llm in generator_inner():
-        yield llm
+    def generator_outer():
+        for llm in generator_inner():
+            yield llm
+            del llm
+
+    return generator_outer
+
+
+def get_output_from_llm_generator(
+        llm_generator, prompts,
+        sampling_params) -> Tuple[List[str], List[List[int]]]:
+    tokens = []
+    token_ids = []
+    for llm in llm_generator():
+        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
+        token_ids = [output.outputs[0].token_ids for output in outputs]
+        tokens = [output.outputs[0].text for output in outputs]
         del llm
+
+    return tokens, token_ids
diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py
new file mode 100644
index 0000000000000..fde950c14382c
--- /dev/null
+++ b/tests/spec_decode/e2e/test_compatibility.py
@@ -0,0 +1,169 @@
+import pytest
+
+from vllm import SamplingParams
+
+from .conftest import get_output_from_llm_generator
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-68m",
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [
+        {
+            # Expect failure as spec decode not supported by
+            # Ray backend.
+            "worker_use_ray": True,
+        },
+    ])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_xfail_ray(test_llm_generator):
+    """Verify that speculative decoding with Ray fails.
+    """
+    output_len = 128
+    temperature = 0.0
+
+    prompts = [
+        "Hello, my name is",
+    ]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    with pytest.raises(AssertionError,
+                       match="Speculative decoding not yet supported for "):
+        get_output_from_llm_generator(test_llm_generator, prompts,
+                                      sampling_params)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-68m",
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [
+    {
+        "enable_chunked_prefill": True,
+    },
+])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_xfail_chunked_prefill(test_llm_generator):
+    """Verify that speculative decoding with chunked prefill fails.
+    """
+    output_len = 128
+    temperature = 0.0
+
+    prompts = [
+        "Hello, my name is",
+    ]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    with pytest.raises(ValueError,
+                       match="Speculative decoding and chunked prefill"):
+        get_output_from_llm_generator(test_llm_generator, prompts,
+                                      sampling_params)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "meta-llama/Llama-2-7b-chat-hf",
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [
+        {
+            # Speculative max model len > overridden max model len should raise.
+            "max_model_len": 128,
+            "speculative_max_model_len": 129,
+        },
+        {
+            # Speculative max model len > draft max model len should raise.
+            # https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
+            "speculative_max_model_len": 2048 + 1,
+        },
+        {
+            # Speculative max model len > target max model len should raise.
+            # https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/f5db02db724555f92da89c216ac04704f23d4590/config.json#L12
+            "speculative_max_model_len": 4096 + 1,
+        },
+    ])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_xfail_spec_max_model_len(test_llm_generator):
+    """Verify that speculative decoding validates speculative_max_model_len.
+    """
+    output_len = 128
+    temperature = 0.0
+
+    prompts = [
+        "Hello, my name is",
+    ]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    with pytest.raises(ValueError, match="cannot be larger than"):
+        get_output_from_llm_generator(test_llm_generator, prompts,
+                                      sampling_params)
+
+
+@pytest.mark.parametrize("common_llm_kwargs", [{
+    "model": "JackFram/llama-68m",
+    "speculative_model": "JackFram/llama-68m",
+    "num_speculative_tokens": 5,
+}])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_xfail_block_manager_v1(test_llm_generator):
+    """Verify that speculative decoding with block manager v1 fails.
+    """
+    output_len = 128
+    temperature = 0.0
+
+    prompts = [
+        "Hello, my name is",
+    ]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    with pytest.raises(ValueError,
+                       match="Speculative decoding requires usage of the V2"):
+        get_output_from_llm_generator(test_llm_generator, prompts,
+                                      sampling_params)
diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py
index a8ebd66841eb2..0536cc4ecde76 100644
--- a/tests/spec_decode/e2e/test_correctness.py
+++ b/tests/spec_decode/e2e/test_correctness.py
@@ -1,11 +1,42 @@
+"""The tests in this file verify end-to-end speculative decoding correctness.
+
+This docstring details important information on the testing methodology.
+
+Most of the tests rely on "greedy equality", where we expect the output of
+speculative decoding on a sequence to exactly match the output of normal non-
+speculative decoding.
+
+Since speculative decoding with rejection sampling guarantees that the output
+distribution matches the target model's output distribution (up to hardware
+numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
+equality. This gives us good coverage of temp=0.
+
+For temp>0, we rely on unit tests on the rejection sampler to verify that the
+output distribution is the same with spec decode vs. no spec decode (this would
+be prohibitively expensive to run with a real model).
+
+NOTE: Speculative decoding's distribution equality requires that the measured
+distributions of the target model and proposal model be deterministic given the
+same input. vLLM largely guarantees this.
+
+@cadedaniel has seen cases where the output probabilities of a draft/target
+model change slightly with certain batch sizes or prompts, even with Torch
+determinism flags set. It is unclear if this is a bug in vLLM, due to non-
+determinism in on-device batched operations, a bug in vLLM's spec decode
+implementation, or the "hardware numerics" limitations. Either way, rejection
+sampling ensures the output distribution matches the target model, but it breaks
+greedy-equality tests for those batch sizes/prompts.
+"""
+
 from itertools import cycle
-from typing import List, Tuple
 
 import pytest
 from transformers import AutoTokenizer
 
 from vllm import SamplingParams
 
+from .conftest import get_output_from_llm_generator
+
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
@@ -14,9 +45,6 @@
         # Note this is repeated in the test body; to initialize a tokenizer.
         "model": "JackFram/llama-68m",
 
-        # Skip real loading for fast test.
-        "load_format": "dummy",
-
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
 
@@ -31,22 +59,15 @@
             "num_speculative_tokens": 5,
         },
         {
-            "speculative_model": "JackFram/llama-68m",
-            "num_speculative_tokens": 1,
-        },
-        {
-            # No spec decode.
+            # Verify the detokenizer assertions in the test work when spec
+            # decode is disabled.
         },
     ])
 @pytest.mark.parametrize("test_llm_kwargs", [{}])
-@pytest.mark.parametrize("batch_size", [1])
-# NOTE: We should run more permutations of this test (more BS, more seeds). But
-# because our spec decode generates gibberish token ids, the likelihood of
-# emitting an invalid token combination is nontrivial. This causes divergence in
-# behavior of vLLM detokenization vs. hf tokenizer, for example when two "utf-
-# start" bytes are emitted.
+@pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
-def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int):
+def test_spec_decode_e2e_with_detokenization(test_llm_generator,
+                                             batch_size: int):
     """Run generation with speculative decoding on a batch. Verify the engine
     generates the correct number of tokens (via ignore_eos=True), and that the
     detokenization matches HF transformers.
@@ -67,8 +88,6 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int):
         max_tokens=output_len,
         ignore_eos=True,
         temperature=temperature,
-        skip_special_tokens=True,
-        spaces_between_special_tokens=False,
     )
 
     batch_tokens, batch_token_ids = get_output_from_llm_generator(
@@ -77,9 +96,10 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int):
     # Expect a generation for each prompt in the batch.
     assert len(batch_token_ids) == len(prompts)
 
-    # Expect each generation to have expected number of tokens (note
-    # ignore_eos=True).
-    assert all(len(token_ids) == output_len for token_ids in batch_token_ids)
+    # Expect each generation to have expected number of tokens (note ignore_eos
+    # is True).
+    assert [len(token_ids)
+            for token_ids in batch_token_ids] == ([output_len] * batch_size)
 
     # Expect detokenized string to match.
     tok = AutoTokenizer.from_pretrained("JackFram/llama-68m")
@@ -92,14 +112,111 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int):
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        # Use a small model for a fast test.
-        "model": "JackFram/llama-68m",
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+    }])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [
+        # Try two different tiny base models.
+        # Note that one is equal to the draft model, another isn't.
+        {
+            "model": "JackFram/llama-68m",
+        },
+        {
+            "model": "JackFram/llama-160m",
+        },
+    ])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use long output len for the small model test.
+        1536,
+    ])
+@pytest.mark.parametrize("batch_size", [1])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
+        baseline_llm_generator, test_llm_generator, batch_size: int,
+        output_len: int):
+    """Verify greedy equality on a tiny model with batch size of one.
+
+    Since this test is cheaper than other e2e correctness tests, we generate
+    with a higher output_len.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
 
-        # Skip real loading for fast test.
-        "load_format": "dummy",
 
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+    }])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [
+        # Try two different tiny base models.
+        # Note that one is equal to the draft model, another isn't.
+        {
+            "model": "JackFram/llama-68m",
+        },
+        {
+            "model": "JackFram/llama-160m",
+        },
+    ])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use small output len for fast test.
+        256,
+    ])
+@pytest.mark.parametrize("batch_size", [64])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
+        baseline_llm_generator, test_llm_generator, batch_size: int,
+        output_len: int):
+    """Verify greedy equality on a tiny model and large batch size.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
 
@@ -109,43 +226,372 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int):
 @pytest.mark.parametrize(
     "per_test_common_llm_kwargs",
     [
+        # Try two different tiny base models.
+        # Note that one is equal to the draft model, another isn't.
         {
-            # Expect failure as spec decode not supported by
-            # Ray backend.
-            "worker_use_ray": True,
+            "model": "JackFram/llama-68m",
+        },
+        {
+            "model": "JackFram/llama-160m",
         },
     ])
-@pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+    },
+])
+@pytest.mark.parametrize("max_output_len", [
+    256,
+])
+@pytest.mark.parametrize("batch_size", [32])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
+        baseline_llm_generator, test_llm_generator, batch_size: int,
+        max_output_len: int):
+    """Verify greedy equality on a tiny model, with a large batch size, and when
+    sampling respects the EOS token.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len,
+                                         force_output_len=False)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # A "real" model (not tiny).
+        "model": "meta-llama/Llama-2-7b-chat-hf",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+    },
+])
+@pytest.mark.parametrize("batch_size", [1])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use decently long output len for a high quality test.
+        256,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
+        baseline_llm_generator, test_llm_generator, batch_size: int,
+        output_len: int):
+    """Verify greedy equality on a "real" model and batch size of 1. This is
+    separate from large BS tests to make identifying the source of bugs easier.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # A "real" model (not tiny).
+        "model": "meta-llama/Llama-2-7b-chat-hf",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+    },
+])
+@pytest.mark.parametrize("batch_size", [32])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        64,
+    ])
 @pytest.mark.parametrize("seed", [1])
-def test_spec_decode_xfail(test_llm_generator):
-    """Verify that speculative decoding with Ray fails.
+def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
+        baseline_llm_generator, test_llm_generator, batch_size: int,
+        output_len: int):
+    """Verify greedy equality with a "real" model on a nontrivial batch size.
+    This is the closest test to a real production workload.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "block_size": 8,
+        # 2 for small prompt, 256//8 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 8,
+        "max_model_len": (2 + 256 // 8) * 8,
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [
+    {
+        "model": "JackFram/llama-160m",
+    },
+])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use small output len for fast test.
+        256,
+    ])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_e2e_greedy_correctness_with_preemption(
+        baseline_llm_generator, test_llm_generator, batch_size: int,
+        output_len: int):
+    """Verify greedy equality, even when some sequences are preempted mid-
+    generation.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-160m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [
+        # As of this writing, vLLM only compiles with these 3 block sizes by
+        # default.
+        {
+            "block_size": 8,
+        },
+        {
+            "block_size": 16,
+        },
+        {
+            "block_size": 32,
+        },
+    ])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+    },
+])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_different_block_size(baseline_llm_generator,
+                                          test_llm_generator, batch_size: int,
+                                          output_len: int):
+    """Verify greedy equality over different block sizes.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-160m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize(
+    "test_llm_kwargs",
+    [
+        {
+            "speculative_model": "JackFram/llama-68m",
+            "num_speculative_tokens": 5,
+
+            # Artificially limit the draft model max model len; this forces vLLM
+            # to skip speculation once the sequences grow beyond 32-k tokens.
+            "speculative_max_model_len": 32,
+        },
+    ])
+@pytest.mark.parametrize("batch_size", [8])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # This must be a good bit larger than speculative_max_model_len so that
+        # we can test the case where all seqs are skipped, but still small to
+        # ensure fast test.
+        64,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_skip_speculation(baseline_llm_generator, test_llm_generator,
+                          batch_size: int, output_len: int):
+    """Verify greedy equality when some (or all) sequences skip speculation.
+    We do this by setting the max model len of the draft model to an
+    artificially low value, such that when the sequences grow beyond it, they
+    are skipped in speculative decoding.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-68m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize(
+    "test_llm_kwargs",
+    [
+        {
+            "speculative_model": "JackFram/llama-68m",
+            "num_speculative_tokens": k,
+        }
+        # Try a range of common k, as well as large speculation.
+        for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]
+    ])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int,
+                output_len: int):
+    """Verify that speculative decoding produces exact equality to without spec
+    decode with many different values of k.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+def run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len,
+                                         force_output_len: bool,
+                                         print_tokens: bool = False):
+    """Helper method that compares the outputs of both the baseline LLM and
+    the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
+    the same when temperature is zero.
     """
-    output_len = 128
     temperature = 0.0
 
     prompts = [
         "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+        "San Francisco is know for its",
+        "Facebook was created in 2004 by",
+        "Curious George is a",
+        "Python 3.11 brings improvements to its",
     ]
 
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    # If the test requires that we generated max_output_len tokens, then set the
+    # sampling params to ignore eos token.
+    ignore_eos = force_output_len
+
     sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
+        max_tokens=max_output_len,
+        ignore_eos=ignore_eos,
         temperature=temperature,
     )
 
-    with pytest.raises(AssertionError,
-                       match="Speculative decoding not yet supported for "):
-        get_output_from_llm_generator(test_llm_generator, prompts,
-                                      sampling_params)
+    spec_batch_tokens, spec_batch_token_ids = get_output_from_llm_generator(
+        test_llm_generator, prompts, sampling_params)
 
+    (baseline_batch_tokens,
+     baseline_batch_token_ids) = get_output_from_llm_generator(
+         baseline_llm_generator, prompts, sampling_params)
 
-def get_output_from_llm_generator(
-        llm_generator, prompts,
-        sampling_params) -> Tuple[List[str], List[List[int]]]:
-    for llm in llm_generator:
-        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
-        token_ids = [output.outputs[0].token_ids for output in outputs]
-        tokens = [output.outputs[0].text for output in outputs]
-        del llm
+    assert len(baseline_batch_token_ids) == len(prompts)
+    assert len(spec_batch_token_ids) == len(prompts)
 
-    return tokens, token_ids
+    for i, (baseline_token_ids, baseline_tokens, spec_token_ids,
+            spec_tokens) in enumerate(
+                zip(baseline_batch_token_ids, baseline_batch_tokens,
+                    spec_batch_token_ids, spec_batch_tokens)):
+        if print_tokens:
+            print(f'{i=} {baseline_tokens=}')
+            print(f'{i=}     {spec_tokens=}')
+        print(f'{i=} {baseline_token_ids=}')
+        print(f'{i=}     {spec_token_ids=}')
+        assert baseline_token_ids == spec_token_ids
diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py
index 36e91672069dc..312878804b86e 100644
--- a/tests/spec_decode/test_metrics.py
+++ b/tests/spec_decode/test_metrics.py
@@ -119,7 +119,7 @@ def test_initial_metrics_has_correct_values(has_data: bool):
         num_draft_tokens = 0
     k = 5
 
-    num_possible_tokens = AsyncMetricsCollector.get_max_num_accepted_tokens(
+    max_num_emitted_tokens = AsyncMetricsCollector.get_max_num_emitted_tokens(
         num_draft_tokens, k)
 
     rej_sampler = MagicMock()
@@ -153,7 +153,7 @@ def test_initial_metrics_has_correct_values(has_data: bool):
         assert (metrics.draft_acceptance_rate == num_accepted_tokens /
                 num_draft_tokens)
         assert (metrics.system_efficiency == num_emitted_tokens /
-                num_possible_tokens)
+                max_num_emitted_tokens)
     else:
         assert math.isnan(metrics.draft_acceptance_rate)
         assert math.isnan(metrics.system_efficiency)
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index d6edbab579afd..e7aaa1ff4eff8 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -344,8 +344,8 @@ def test_draft_proposals_no_speculations():
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
 
-    assert proposals.proposal_token_ids.shape == torch.Size([0, k])
-    assert proposals.proposal_probs.shape[:-1] == torch.Size([0, k])
+    assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k])
+    assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k])
 
     assert proposals.proposal_lens.shape == torch.Size([batch_size])
     assert proposals.proposal_lens.tolist() == [0 for _ in range(batch_size)]
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index 0a3110775e2d6..d24d726c9c0cf 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -1,4 +1,5 @@
 import random
+from types import SimpleNamespace
 from unittest.mock import MagicMock
 
 import pytest
@@ -62,8 +63,8 @@ def test_correctly_calls_target_model(k: int, batch_size: int):
     """Verify SpecDecodeWorker calls the target model with correct
     inputs. Everything else is mocked out.
     """
-    draft_worker = mock_worker(cls=MultiStepWorker)
-    target_worker = mock_worker()
+    draft_worker = mock_worker(cls=MultiStepWorker, use_spec=False)
+    target_worker = mock_worker(use_spec=False)
     rejection_sampler = MagicMock(spec=RejectionSampler)
     rejection_sampler.token_id_dtype = torch.int64
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
@@ -144,8 +145,10 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int):
     """
     vocab_size = 32_000
 
-    draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size)
-    target_worker = mock_worker(vocab_size=vocab_size)
+    draft_worker = mock_worker(cls=MultiStepWorker,
+                               vocab_size=vocab_size,
+                               use_spec=False)
+    target_worker = mock_worker(vocab_size=vocab_size, use_spec=False)
     rejection_sampler = MagicMock(spec=RejectionSampler)
     rejection_sampler.token_id_dtype = torch.int64
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
@@ -202,17 +205,16 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int):
                              num_lookahead_slots=k)
 
     assert len(rejection_sampler.call_args_list) == 1
-    args, _ = rejection_sampler.call_args_list[0]
-    (actual_proposal_scores, actual_bonus_token_ids, actual_proposal_probs,
-     actual_proposal_token_ids) = args
+    _, kwargs = rejection_sampler.call_args_list[0]
+    actual = SimpleNamespace(**kwargs)
 
-    assert torch.equal(actual_bonus_token_ids,
+    assert torch.equal(actual.bonus_token_ids,
                        target_token_ids.reshape(batch_size, k + 1)[:, -1:])
     assert torch.equal(
-        actual_proposal_scores,
+        actual.target_probs,
         target_token_probs.reshape(batch_size, k + 1, -1)[:, :-1])
-    assert torch.equal(actual_proposal_token_ids, proposal_token_ids)
-    assert torch.equal(actual_proposal_probs, proposal_probs)
+    assert torch.equal(actual.draft_token_ids, proposal_token_ids)
+    assert torch.equal(actual.draft_probs, proposal_probs)
 
 
 @pytest.mark.parametrize('k', [1, 2, 6])
@@ -224,8 +226,10 @@ def test_correctly_formats_output(k: int, batch_size: int):
     """
     vocab_size = 32_000
 
-    draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size)
-    target_worker = mock_worker(vocab_size=vocab_size)
+    draft_worker = mock_worker(cls=MultiStepWorker,
+                               vocab_size=vocab_size,
+                               use_spec=False)
+    target_worker = mock_worker(vocab_size=vocab_size, use_spec=False)
     rejection_sampler = MagicMock(spec=RejectionSampler)
     rejection_sampler.token_id_dtype = torch.int64
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
@@ -336,8 +340,10 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool):
     """
     vocab_size = 32_000
 
-    draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size)
-    target_worker = mock_worker(vocab_size=vocab_size)
+    draft_worker = mock_worker(cls=MultiStepWorker,
+                               vocab_size=vocab_size,
+                               use_spec=False)
+    target_worker = mock_worker(vocab_size=vocab_size, use_spec=False)
     rejection_sampler = MagicMock(spec=RejectionSampler)
     rejection_sampler.token_id_dtype = torch.int64
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
@@ -500,8 +506,8 @@ def test_init_device():
     """Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as
     well as other GPU initialization.
     """
-    draft_worker = mock_worker(cls=MultiStepWorker)
-    target_worker = mock_worker()
+    draft_worker = mock_worker(cls=MultiStepWorker, use_spec=False)
+    target_worker = mock_worker(use_spec=False)
     rejection_sampler = MagicMock(spec=RejectionSampler)
     rejection_sampler.token_id_dtype = torch.int64
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index d04b6029493f4..4f8295d25cf41 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -63,11 +63,14 @@ def create_execute_model_data(
 def mock_worker(cls=None,
                 vocab_size: int = 30_000,
                 max_model_len: int = 2048,
-                rank: int = 0) -> MagicMock:
+                rank: int = 0,
+                use_spec: bool = True) -> MagicMock:
     if cls is None:
         cls = Worker
 
-    worker = MagicMock(spec=cls)
+    spec = cls if use_spec else None
+
+    worker = MagicMock(spec=spec)
     worker.vocab_size = vocab_size
     worker.max_model_len = max_model_len
     worker.rank = rank
diff --git a/vllm/config.py b/vllm/config.py
index 33840f6f1c036..8ccc000f53a25 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -695,6 +695,9 @@ def maybe_create_spec_config(
         target_dtype: str,
         speculative_model: Optional[str],
         num_speculative_tokens: Optional[int],
+        speculative_max_model_len: Optional[int],
+        enable_chunked_prefill: bool,
+        use_v2_block_manager: bool,
     ) -> Optional["SpeculativeConfig"]:
         """Create a SpeculativeConfig if possible, else return None.
 
@@ -712,6 +715,15 @@ def maybe_create_spec_config(
                 model, if provided.
             num_speculative_tokens (Optional[int]): The number of speculative
                 tokens, if provided.
+            speculative_max_model_len (Optional[int]): The maximum model len of
+                the speculative model. Used when testing the ability to skip
+                speculation for some sequences.
+            enable_chunked_prefill (bool): Whether vLLM is configured to use
+                chunked prefill or not. Used for raising an error since its not
+                yet compatible with spec decode.
+            use_v2_block_manager (bool): Whether vLLM is configured to use the
+                v2 block manager or not. Used for raising an error since the v2
+                block manager is required with spec decode.
 
         Returns:
             Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
@@ -730,12 +742,21 @@ def maybe_create_spec_config(
         assert (speculative_model is not None
                 and num_speculative_tokens is not None)
 
+        if enable_chunked_prefill:
+            raise ValueError(
+                "Speculative decoding and chunked prefill are "
+                f"currently mutually exclusive ({enable_chunked_prefill=}).")
+
+        if not use_v2_block_manager:
+            raise ValueError(
+                "Speculative decoding requires usage of the V2 "
+                "block manager. Enable it with --use-v2-block-manager.")
+
         # TODO: The user should be able to specify revision/quantization/max
         # model len for the draft model. It is not currently supported.
         draft_revision = None
         draft_code_revision = None
         draft_quantization = None
-        draft_max_model_len = None
 
         draft_model_config = ModelConfig(
             model=speculative_model,
@@ -747,7 +768,7 @@ def maybe_create_spec_config(
             revision=draft_revision,
             code_revision=draft_code_revision,
             tokenizer_revision=target_model_config.tokenizer_revision,
-            max_model_len=draft_max_model_len,
+            max_model_len=None,
             quantization=draft_quantization,
             enforce_eager=target_model_config.enforce_eager,
             max_context_len_to_capture=target_model_config.
@@ -755,6 +776,13 @@ def maybe_create_spec_config(
             max_logprobs=target_model_config.max_logprobs,
         )
 
+        draft_model_config.max_model_len = (
+            SpeculativeConfig._maybe_override_draft_max_model_len(
+                speculative_max_model_len,
+                draft_model_config.max_model_len,
+                target_model_config.max_model_len,
+            ))
+
         draft_parallel_config = (
             SpeculativeConfig.create_draft_parallel_config(
                 target_parallel_config))
@@ -765,6 +793,41 @@ def maybe_create_spec_config(
             num_speculative_tokens,
         )
 
+    @staticmethod
+    def _maybe_override_draft_max_model_len(
+        speculative_max_model_len: Optional[int],
+        draft_max_model_len: int,
+        target_max_model_len: int,
+    ) -> int:
+        """Determine the max sequence len for the draft model. This is usually
+        the draft_max_model_len, but may be the target_max_model_len if it is
+        less than the draft_max_model_len, or may be speculative_max_model_len
+        if it is specified.
+
+        This is necessary so that sequences do not exceed the capacity of the
+        draft model or the target model.
+
+        speculative_max_model_len is mainly used for testing that sequences can
+        skip speculation.
+        """
+
+        if speculative_max_model_len is not None:
+
+            if speculative_max_model_len > draft_max_model_len:
+                raise ValueError(f"{speculative_max_model_len=} cannot be "
+                                 f"larger than {draft_max_model_len=}")
+
+            if speculative_max_model_len > target_max_model_len:
+                raise ValueError(f"{speculative_max_model_len=} cannot be "
+                                 f"larger than {target_max_model_len=}")
+
+            return speculative_max_model_len
+
+        return min(
+            draft_max_model_len,
+            target_max_model_len,
+        )
+
     @staticmethod
     def create_draft_parallel_config(
             target_parallel_config: ParallelConfig) -> ParallelConfig:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 5d3d73c0dd60e..330681e863d4e 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -77,6 +77,7 @@ class EngineArgs:
     # Speculative decoding configuration.
     speculative_model: Optional[str] = None
     num_speculative_tokens: Optional[int] = None
+    speculative_max_model_len: Optional[int] = None
 
     def __post_init__(self):
         if self.tokenizer is None:
@@ -241,7 +242,7 @@ def add_cli_args(
         parser.add_argument('--block-size',
                             type=int,
                             default=EngineArgs.block_size,
-                            choices=[8, 16, 32, 128],
+                            choices=[8, 16, 32],
                             help='Token block size for contiguous chunks of '
                             'tokens.')
 
@@ -435,17 +436,25 @@ def add_cli_args(
         parser.add_argument(
             '--speculative-model',
             type=str,
-            default=None,
+            default=EngineArgs.speculative_model,
             help=
             'The name of the draft model to be used in speculative decoding.')
 
         parser.add_argument(
             '--num-speculative-tokens',
             type=int,
-            default=None,
+            default=EngineArgs.num_speculative_tokens,
             help='The number of speculative tokens to sample from '
             'the draft model in speculative decoding.')
 
+        parser.add_argument(
+            '--speculative-max-model-len',
+            type=str,
+            default=EngineArgs.speculative_max_model_len,
+            help='The maximum sequence length supported by the '
+            'draft model. Sequences over this length will skip '
+            'speculation.')
+
         parser.add_argument('--model-loader-extra-config',
                             type=str,
                             default=EngineArgs.model_loader_extra_config,
@@ -497,6 +506,9 @@ def create_engine_config(self, ) -> EngineConfig:
             target_dtype=self.dtype,
             speculative_model=self.speculative_model,
             num_speculative_tokens=self.num_speculative_tokens,
+            speculative_max_model_len=self.speculative_max_model_len,
+            enable_chunked_prefill=self.enable_chunked_prefill,
+            use_v2_block_manager=self.use_v2_block_manager,
         )
 
         scheduler_config = SchedulerConfig(
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index dcb644b34c234..703cba8f88aa2 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -22,7 +22,7 @@
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (MultiModalData, SamplerOutput, Sequence,
-                           SequenceGroup)
+                           SequenceGroup, SequenceStage)
 from vllm.transformers_utils.detokenizer import Detokenizer
 from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
                                                      get_tokenizer_group)
@@ -482,9 +482,12 @@ def _process_model_outputs(
             seq_group = scheduled_seq_group.seq_group
             seq_group.update_num_computed_tokens(
                 scheduled_seq_group.token_chunk_size)
-            # If uncomputed tokens > 0, it means prefill is chunked.
-            # We don't need to process outputs in that case.
-            if seq_group.get_num_uncomputed_tokens() == 0:
+
+            # If all sequences in the sequence group are in DECODE, then we can
+            # process the output tokens. Otherwise, they are (chunked) prefill
+            # samples and should not be processed.
+            stages = [seq.data._stage for seq in seq_group.seqs_dict.values()]
+            if all(stage == SequenceStage.DECODE for stage in stages):
                 self.output_processor.process_outputs(seq_group, outputs)
 
         # Free the finished sequence groups.
@@ -571,7 +574,8 @@ def step(self) -> List[RequestOutput]:
 
         # Log stats.
         if self.log_stats:
-            self.stat_logger.log(self._get_stats(scheduler_outputs))
+            self.stat_logger.log(
+                self._get_stats(scheduler_outputs, model_output=output))
 
         return request_outputs
 
@@ -580,9 +584,18 @@ def do_log_stats(self) -> None:
         if self.log_stats:
             self.stat_logger.log(self._get_stats(scheduler_outputs=None))
 
-    def _get_stats(self,
-                   scheduler_outputs: Optional[SchedulerOutputs]) -> Stats:
-        """Get Stats to be Logged to Prometheus."""
+    def _get_stats(
+            self,
+            scheduler_outputs: Optional[SchedulerOutputs],
+            model_output: Optional[List[SamplerOutput]] = None) -> Stats:
+        """Get Stats to be Logged to Prometheus.
+
+        Args:
+            scheduler_outputs: Optional, used to populate metrics related to
+                the scheduled batch,
+            model_output: Optional, used to emit speculative decoding metrics
+                which are created by the workers.
+        """
         now = time.time()
 
         # KV Cache Usage in %.
@@ -639,6 +652,14 @@ def _get_stats(self,
             time_to_first_tokens = time_last_iters if prompt_run else []
             time_per_output_tokens = [] if prompt_run else time_last_iters
 
+        # Spec decode, if enabled, emits specialized metrics from the worker in
+        # sampler output.
+        if model_output and (model_output[0].spec_decode_worker_metrics
+                             is not None):
+            spec_decode_metrics = model_output[0].spec_decode_worker_metrics
+        else:
+            spec_decode_metrics = None
+
         return Stats(
             now=now,
             num_running=num_running,
@@ -651,6 +672,7 @@ def _get_stats(self,
             time_to_first_tokens=time_to_first_tokens,
             time_per_output_tokens=time_per_output_tokens,
             time_e2e_requests=time_e2e_requests,
+            spec_decode_metrics=spec_decode_metrics,
         )
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 04e27e69ce0f3..25e96f6c7eaf7 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -1,6 +1,6 @@
 import time
 from dataclasses import dataclass
-from typing import Dict, List, Protocol
+from typing import TYPE_CHECKING, Dict, List, Optional, Protocol
 
 import numpy as np
 from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info,
@@ -8,6 +8,9 @@
 
 from vllm.logger import init_logger
 
+if TYPE_CHECKING:
+    from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
+
 logger = init_logger(__name__)
 
 disable_created_metrics()
@@ -118,6 +121,8 @@ class Stats:
     time_per_output_tokens: List[float]
     time_e2e_requests: List[float]
 
+    spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
+
 
 class SupportsMetricsInfo(Protocol):
 
@@ -235,3 +240,19 @@ def log(self, stats: Stats) -> None:
             self.num_prompt_tokens = []
             self.num_generation_tokens = []
             self.last_local_log = stats.now
+
+            if stats.spec_decode_metrics is not None:
+                logger.info(
+                    self._format_spec_decode_metrics_str(
+                        stats.spec_decode_metrics))
+
+    def _format_spec_decode_metrics_str(
+            self, metrics: "SpecDecodeWorkerMetrics") -> str:
+
+        return ("Speculative metrics: "
+                f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, "
+                f"System efficiency: {metrics.system_efficiency:.3f}, "
+                f"Number of speculative tokens: {metrics.num_spec_tokens}, "
+                f"Number of accepted tokens: {metrics.accepted_tokens}, "
+                f"Number of draft tokens tokens: {metrics.draft_tokens}, "
+                f"Number of emitted tokens tokens: {metrics.emitted_tokens}.")
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index 77c997f97956e..d413a7d27ff37 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -83,6 +83,7 @@ def _init_spec_worker(self):
             scheduler_config=self.scheduler_config,
             device_config=self.device_config,
             cache_config=self.cache_config,
+            # TODO allow draft-model specific load config.
             load_config=self.load_config,
             local_rank=0,
             rank=0,
diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index ecd2bd0fce3a3..5edbbf2c70a49 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -144,6 +144,7 @@ def _batch_modified_rejection_sampling(
         recovered_probs = self._get_recovered_probs(
             target_probs, draft_probs).reshape(batch_size * k, vocab_size)
 
+        # NOTE: the recovered_probs are overwritten by this method.
         recovered_token_ids = _multinomial(recovered_probs,
                                            num_samples=1).reshape(
                                                batch_size, k)
@@ -307,6 +308,12 @@ def _create_output(
         output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1,
                                                       bonus_token_ids, -1)
 
+        # We disable bonus tokens because it causes corrupt KV cache for
+        # proposal methods that require KV cache. We can fix it by "prefilling"
+        # the bonus token in the proposer. The following issue tracks the fix.
+        # https://github.com/vllm-project/vllm/issues/4212
+        output_with_bonus_tokens[:, -1] = -1
+
         # Fill the recovered token ids.
         output.mul_(~after_false_mask).add_(
             recovered_token_ids.mul(after_false_mask))
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index 03bf38caebe0e..c4b11cb33a677 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -35,6 +35,14 @@ class Sampler(nn.Module):
     in logits for each token in the input prompt.
     """
 
+    def __init__(self):
+        super().__init__()
+
+        # Whether or not the SamplerOutput should have on-device tensors
+        # containing the sampled token ids and probabilities. This is used by
+        # speculative decoding.
+        self.include_gpu_probs_tensor = False
+
     def forward(
         self,
         logits: torch.Tensor,
@@ -79,13 +87,45 @@ def forward(
         logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
 
         # Sample the next tokens.
-        sample_results = _sample(probs, logprobs, sampling_metadata,
-                                 sampling_tensors)
+        sample_results, maybe_sampled_tokens_tensor = _sample(
+            probs,
+            logprobs,
+            sampling_metadata,
+            sampling_tensors,
+            include_gpu_probs_tensor=self.include_gpu_probs_tensor,
+            modify_greedy_probs=self._should_modify_greedy_probs_inplace,
+        )
+
+        if self.include_gpu_probs_tensor:
+            assert maybe_sampled_tokens_tensor is not None
+            sampled_tokens_tensor = maybe_sampled_tokens_tensor
+            on_device_tensors = (probs, sampled_tokens_tensor)
+        else:
+            on_device_tensors = None
+
         # Get the logprobs query results.
         prompt_logprobs, sample_logprobs = _get_logprobs(
             logprobs, sampling_metadata, sample_results)
-        return _build_sampler_output(sample_results, sampling_metadata,
-                                     prompt_logprobs, sample_logprobs)
+        return _build_sampler_output(sample_results,
+                                     sampling_metadata,
+                                     prompt_logprobs,
+                                     sample_logprobs,
+                                     on_device_tensors=on_device_tensors)
+
+    @property
+    def _should_modify_greedy_probs_inplace(self) -> bool:
+        """Whether or not the sampler should modify the probability distribution
+        of greedily-sampled tokens such that multinomial sampling would sample
+        the greedily-sampled token.
+
+        In other words, if True then we set the probability of the greedily-
+        sampled token to 1.
+
+        This is used by speculative decoding, which requires that the sampling
+        method be encoded into the probability distribution.
+        """
+        # Modify greedy probs if include_gpu_probs_tensor is set.
+        return self.include_gpu_probs_tensor
 
 
 def _get_bin_counts_and_mask(
@@ -359,7 +399,9 @@ def _sample_with_torch(
     probs: torch.Tensor,
     logprobs: torch.Tensor,
     sampling_metadata: SamplingMetadata,
-) -> List[Tuple[List[int], List[int]]]:
+    include_gpu_probs_tensor: bool,
+    modify_greedy_probs: bool,
+) -> Tuple[List[Tuple[List[int], List[int]]], Optional[torch.Tensor]]:
     categorized_seq_group_ids = {t: [] for t in SamplingType}
     categorized_sample_indices = sampling_metadata.categorized_sample_indices
     for i, seq_group in enumerate(sampling_metadata.seq_groups):
@@ -371,6 +413,15 @@ def _sample_with_torch(
     sample_metadata = {}
     multinomial_samples = {}
 
+    # Create output tensor for sampled token ids.
+    if include_gpu_probs_tensor:
+        sampled_token_ids_tensor = torch.empty(logprobs.shape[0],
+                                               1,
+                                               dtype=torch.long,
+                                               device=logprobs.device)
+    else:
+        sampled_token_ids_tensor = None
+
     # Counterintiutively, having two loops here is actually faster.
     # The first loop can run without waiting on GPU<->CPU sync.
     for sampling_type in SamplingType:
@@ -383,9 +434,25 @@ def _sample_with_torch(
         is_prompts = [i < sampling_metadata.num_prompts for i in seq_group_ids]
         sample_metadata[sampling_type] = (seq_group_ids, seq_groups,
                                           is_prompts, sample_indices)
+        long_sample_indices = sample_indices.long()
+
         if sampling_type == SamplingType.GREEDY:
-            greedy_samples = torch.argmax(logprobs[sample_indices.long()],
+            greedy_samples = torch.argmax(logprobs[long_sample_indices],
                                           dim=-1)
+
+            if include_gpu_probs_tensor:
+                # Store sampled tokens in output tensor.
+                sampled_token_ids_tensor[
+                    long_sample_indices] = greedy_samples.unsqueeze(-1)
+
+            if modify_greedy_probs:
+                # If required, modify the probabilities such that sampling from
+                # the modified distribution would always sample the argmax
+                # token id.
+                _modify_greedy_probs_inplace(logprobs, probs,
+                                             long_sample_indices,
+                                             greedy_samples)
+
         elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED):
             max_best_of_in_batch = 1
             for seq_group, is_prompt in zip(seq_groups, is_prompts):
@@ -397,15 +464,23 @@ def _sample_with_torch(
                 "seq_groups": seq_groups,
                 "generators": sampling_metadata.generators,
             }
+
             multinomial_samples[sampling_type] = _multinomial(
-                probs[sample_indices.long()], max_best_of_in_batch,
+                probs[long_sample_indices], max_best_of_in_batch,
                 **seeded_args)
+
+            if include_gpu_probs_tensor:
+                # Store sampled tokens in output tensor.
+                sampled_token_ids_tensor[
+                    long_sample_indices] = multinomial_samples[sampling_type]
+
         elif sampling_type == SamplingType.BEAM:
             beam_search_logprobs = logprobs[sample_indices]
         else:
             raise ValueError(f"Unsupported sampling type: {sampling_type}")
 
     # GPU<->CPU sync happens in the loop below.
+    # This also converts the sample output to Python objects.
 
     for sampling_type in SamplingType:
         if sampling_type not in sample_metadata:
@@ -427,7 +502,7 @@ def _sample_with_torch(
         sample_results_dict[i]
         for i in range(len(sampling_metadata.seq_groups))
     ]
-    return sample_results
+    return sample_results, sampled_token_ids_tensor
 
 
 def _sample_with_triton_kernel(
@@ -511,12 +586,17 @@ def _sample_with_triton_kernel(
 
 
 def _sample(
-    probs: torch.Tensor,
-    logprobs: torch.Tensor,
-    sampling_metadata: SamplingMetadata,
-    sampling_tensors: SamplingTensors,
-) -> List[Tuple[List[int], List[int]]]:
-    return _sample_with_torch(probs, logprobs, sampling_metadata)
+    probs: torch.Tensor, logprobs: torch.Tensor,
+    sampling_metadata: SamplingMetadata, sampling_tensors: SamplingTensors,
+    include_gpu_probs_tensor: bool, modify_greedy_probs: bool
+) -> Tuple[List[Tuple[List[int], List[int]]], Optional[torch.Tensor]]:
+    return _sample_with_torch(
+        probs,
+        logprobs,
+        sampling_metadata,
+        include_gpu_probs_tensor=include_gpu_probs_tensor,
+        modify_greedy_probs=modify_greedy_probs,
+    )
 
     # TODO: Enable once Triton kernel & associated code is faster.
     # return _sample_with_triton_kernel(probs, logprobs, sampling_metadata,
@@ -680,12 +760,73 @@ def _get_logprobs(
     return result_prompt_logprobs, result_sample_logprobs
 
 
+def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor,
+                                 sample_indices: torch.Tensor,
+                                 greedy_samples: torch.Tensor) -> None:
+    """Modify the probability distributions of the greedily-sampled tokens such
+    that each sampled token has a "probability" of 1.0. This is required by
+    speculative decoding, which depends on the sampling method being encoded
+    within the probability distribution for correctness.
+
+    # Why do we only need to do this for greedy sampling?
+
+    vLLM's sampler performs the following steps for greedy or multinomial
+    (random) sampling:
+        1. Get logits from model.
+        2. Modify logits according to per-sequence sampling parameters.
+            - Multiply by temperature, top-k and top-p masking, penalize tokens
+                according to their frequency, etc.
+        3. Sample a token.
+            - Random sampling simply samples from the modified probability
+                distribution.
+            - Greedy sampling performs `argmax` to obtain the token with the
+                highest likelihood.
+    
+    Ignoring greedy sampling for a moment, we find that the computed probability
+    distribution has the following property: we can sample from it independently
+    and find that the token sampled by the Sampler has a frequency corresponding
+    to how often we see it in our sampling. In other words, for tokens sampled
+    with vLLM's random SamplingType, the computed probability distribution
+    encodes the sampling methodology completely.
+
+    Greedy sampling does not normally have this property. vLLM modifies logits
+    according to sampling params, then performs `argmax`, then returns the
+    sampled token and the computed probability distribution. If we sample from
+    the distribution, we'll find the likelihood of the greedily-sampled token
+    is not always 1.0.
+
+    Since lossless speculative decoding requires that the sampling methodology
+    be encoded within the probability distribution, we are motivated to modify
+    the probability distribution such that the sampled token has probability 1
+    when speculative decoding is used.
+
+    NOTE: Alternatively, we could use an extremely low temperature to achieve
+    greedy sampling using multinomial computation and unite the codepaths. This
+    has implications on the overall design of the sampler, e.g. how to record
+    accurate logprobs for the user, so this improvement is deferred to later.
+    """
+    logprobs[sample_indices, :] = -float('inf')
+    logprobs[sample_indices, greedy_samples] = 0.0
+    probs[sample_indices, :] = 0
+    probs[sample_indices, greedy_samples] = 1.0
+
+
 def _build_sampler_output(
     sample_results: List[Tuple[List[int], List[int]]],
     sampling_metadata: SamplingMetadata,
     prompt_logprobs: List[Optional[PromptLogprobs]],
     sample_logprobs: List[SampleLogprobs],
+    on_device_tensors: Optional[Tuple[torch.Tensor, torch.Tensor]],
 ) -> SamplerOutput:
+    """Construct Python objects with the output of sampling.
+
+    Args:
+        on_device_tensors: Tuple containing on-device tensors with the
+            probabilities used in sampling and the sampled token ids. This
+            allows post-processing without copies to CPU/serialization, e.g. in
+            speculative decoding rejection sampling.
+    """
+
     sampler_output = []
     for (seq_group, sample_result, group_prompt_logprobs,
          group_sample_logprobs) in zip(sampling_metadata.seq_groups,
@@ -701,4 +842,15 @@ def _build_sampler_output(
                 SequenceOutput(seq_ids[parent_id], next_token_id, logprobs))
         sampler_output.append(
             SequenceGroupOutput(seq_outputs, group_prompt_logprobs))
-    return SamplerOutput(outputs=sampler_output)
+
+    # If not specified, store None values in SamplerOutput.
+    if on_device_tensors is not None:
+        sampled_token_probs, sampled_token_ids = on_device_tensors
+    else:
+        sampled_token_probs, sampled_token_ids = (None, None)
+
+    return SamplerOutput(
+        outputs=sampler_output,
+        sampled_token_probs=sampled_token_probs,
+        sampled_token_ids=sampled_token_ids,
+    )
diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py
index bbc5b1778854f..c29b838f854c0 100644
--- a/vllm/spec_decode/batch_expansion.py
+++ b/vllm/spec_decode/batch_expansion.py
@@ -6,8 +6,8 @@
 from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeScorer, SpeculativeScores)
-from vllm.spec_decode.util import (get_all_seq_ids, maybe_mock_device_tensors,
-                                   nvtx_range, sampler_output_to_torch,
+from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range,
+                                   sampler_output_to_torch,
                                    split_batch_by_proposal_len)
 from vllm.worker.worker_base import WorkerBase
 
@@ -72,10 +72,16 @@ def score_proposals(
         proposal_lens_list = proposals.proposal_lens.tolist()
         proposal_token_ids_list = proposals.proposal_token_ids.tolist()
 
+        # Filter the list to ignore -1 proposals.
+        proposal_token_ids_list_without_skips = [
+            proposals for proposals in proposal_token_ids_list
+            if -1 not in proposals
+        ]
+
         (spec_indices, non_spec_indices, target_seq_group_metadata_list,
          num_scoring_tokens) = self._expand_batch(
              seq_group_metadata_list=seq_group_metadata_list,
-             proposal_token_ids_list=proposal_token_ids_list,
+             proposal_token_ids_list=proposal_token_ids_list_without_skips,
              proposal_lens_list=proposal_lens_list,
          )
 
@@ -89,7 +95,7 @@ def score_proposals(
         target_sampler_output = target_sampler_output[0]
 
         all_tokens, all_probs = self._contract_batch(
-            original_bs=len(seq_group_metadata_list),
+            contracted_bs=len(seq_group_metadata_list),
             target_sampler_output=target_sampler_output,
             proposals=proposals,
             num_scoring_tokens=num_scoring_tokens,
@@ -128,14 +134,21 @@ def _expand_batch(
             select_proposal_len_zero=True)
 
         target_seq_group_metadata_list = self._create_scoring_model_input(
-            spec_seqs, proposal_token_ids_list)
+            seq_group_metadata_list=spec_seqs,
+            proposal_token_ids=proposal_token_ids_list,
+            # NOTE: We determine the seq ids in the expanded batch using the
+            # full seq_group_metadata_list, instead of only spec_seqs.
+            target_seq_ids_iter=self._create_target_seq_id_iterator(
+                seq_ids=get_all_seq_ids(seq_group_metadata_list)),
+        )
+
         num_scoring_tokens = len(target_seq_group_metadata_list)
         target_seq_group_metadata_list.extend(non_spec_seqs)
 
         return (spec_indices, non_spec_indices, target_seq_group_metadata_list,
                 num_scoring_tokens)
 
-    def _contract_batch(self, original_bs: int,
+    def _contract_batch(self, contracted_bs: int,
                         target_sampler_output: List[SamplerOutput],
                         proposals: SpeculativeProposals,
                         num_scoring_tokens: int, non_spec_indices: List[int],
@@ -144,42 +157,41 @@ def _contract_batch(self, original_bs: int,
         """Contract the expanded batch back into its original size.
         This maps the scores of speculative tokens back to their original
         sequences.
-        """
-
-        # We mock the device tensors until PR 7/9 is merged (e2e correctness).
-        # https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer
-        maybe_mock_device_tensors(
-            sampler_output=target_sampler_output,
-            batch_size=len(non_spec_indices) + num_scoring_tokens,
-            vocab_size=self._vocab_size,
-            device=self._device,
-        )
 
+        contracted_bs is the original batch size, and the batch size that the
+        target_sampler_output will be contracted to.
+        """
         (target_token_ids, target_probs, non_spec_target_token_ids,
          non_spec_target_probs) = self._split_scoring_output(
              target_sampler_output, num_scoring_tokens)
 
         # Map distinct sequences used to score each token
         # of shape [batch_size * k + 1] back to [batch_size, k + 1].
-        batch_size, k = proposals.proposal_token_ids.shape
+        expanded_batch_size, k = proposals.proposal_token_ids.shape
+
+        # The number of tokens in the expanded batch used for speculation is
+        # equal to the total expanded batch size minus the number of samples for
+        # non-speculative sequences.
+        non_spec_expanded_bs, _ = non_spec_target_token_ids.shape
+        spec_expanded_bs = expanded_batch_size - non_spec_expanded_bs
 
         target_token_ids = target_token_ids.squeeze().reshape(
-            batch_size, k + 1)
-        target_probs = target_probs.squeeze().reshape(batch_size, k + 1,
+            spec_expanded_bs, k + 1)
+        target_probs = target_probs.squeeze().reshape(spec_expanded_bs, k + 1,
                                                       self._vocab_size)
 
-        all_tokens = torch.full(size=(original_bs, k + 1),
+        all_tokens = torch.full(size=(contracted_bs, k + 1),
                                 fill_value=-1,
                                 device=self._device,
                                 dtype=torch.long)
-        all_probs = torch.zeros(original_bs,
+        all_probs = torch.zeros(contracted_bs,
                                 k + 1,
                                 self._vocab_size,
                                 device=self._device,
                                 dtype=torch.float32)
 
         if non_spec_indices:
-            all_tokens[non_spec_indices, 0] = non_spec_target_token_ids
+            all_tokens[non_spec_indices, :1] = non_spec_target_token_ids
             all_probs[non_spec_indices, :1, :] = non_spec_target_probs
 
         if spec_indices:
@@ -189,20 +201,22 @@ def _contract_batch(self, original_bs: int,
         return all_tokens, all_probs
 
     def _create_scoring_model_input(
-            self,
-            seq_group_metadata_list: List[SequenceGroupMetadata],
-            proposal_token_ids: List[List[TokenId]],  # shape: [batch_size, k]
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        proposal_token_ids: List[List[TokenId]],  # shape: [batch_size, k]
+        target_seq_ids_iter: Iterator[TargetSeqId],
     ) -> List[SequenceGroupMetadata]:
         """Given the original input sequences and proposed tokens from the draft
         model, create a list of target sequences that can be used for scoring.
+
+        target_seq_ids_iter provides sequence ids for the expanded batch,
+        fulfilling the requirement that no seq id in the expanded batch is equal
+        to the seq id in the original batch.
         """
 
         if not seq_group_metadata_list:
             return []
 
-        target_seq_ids_iter = self._create_target_seq_id_iterator(
-            get_all_seq_ids(seq_group_metadata_list))
-
         target_seq_group_metadata = list(
             chain.from_iterable(
                 self._create_target_seq_group_metadata(
diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py
index f0715120192e5..dd040779922e9 100644
--- a/vllm/spec_decode/interfaces.py
+++ b/vllm/spec_decode/interfaces.py
@@ -24,9 +24,9 @@ class SpeculativeProposals:
 
     def __repr__(self):
         return (f"SpeculativeProposals("
-                f"proposal_token_ids={self.proposal_token_ids.shape}, "
+                f"proposal_token_ids={self.proposal_token_ids}, "
                 f"proposal_probs={self.proposal_probs.shape}, "
-                f"proposal_lens={self.proposal_lens.shape})")
+                f"proposal_lens={self.proposal_lens})")
 
 
 @dataclass
diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py
index d1e72b6640548..ab1d96c558de7 100644
--- a/vllm/spec_decode/metrics.py
+++ b/vllm/spec_decode/metrics.py
@@ -147,15 +147,16 @@ def _collect_rejsample_metrics(
         emitted_tokens = self._aggregate_num_emitted_tokens.item()
         draft_tokens = self._aggregate_num_draft_tokens
 
-        num_possible_tokens = self.get_max_num_accepted_tokens(draft_tokens, k)
+        max_num_emitted_tokens = self.get_max_num_emitted_tokens(
+            draft_tokens, k)
 
         if draft_tokens > 0:
             draft_acceptance_rate = accepted_tokens / draft_tokens
         else:
             draft_acceptance_rate = float("nan")
 
-        if num_possible_tokens > 0:
-            system_efficiency = emitted_tokens / num_possible_tokens
+        if max_num_emitted_tokens > 0:
+            system_efficiency = emitted_tokens / max_num_emitted_tokens
         else:
             system_efficiency = float("nan")
 
@@ -169,8 +170,22 @@ def _collect_rejsample_metrics(
         )
 
     @staticmethod
-    def get_max_num_accepted_tokens(draft_tokens: int, k: int) -> int:
-        # Divide by k since batch size can be variable.
-        total_num_spec_seqs = draft_tokens / k
-        num_accepted_per_seq_if_all_accepted = k + 1
-        return int(total_num_spec_seqs / num_accepted_per_seq_if_all_accepted)
+    def get_max_num_emitted_tokens(draft_tokens: int, k: int) -> int:
+        """Calculate the number of emitted tokens, assuming all tokens are
+        accepted.
+
+        This is equal to the number of sequences that have been speculated on,
+        times (speculation len + 1). The +1 comes from the bonus token.
+        """
+        # Determine the number of sequences that have been speculated on. Since
+        # the batch size can be variable, we divide by k.
+        assert draft_tokens % k == 0
+        total_num_spec_seqs = draft_tokens // k
+
+        # A single sequence may emit k accepted tokens and one bonus token in
+        # the best case.
+        num_emitted_per_seq_if_all_accepted = k + 1
+
+        # The max num of emitted tokens is the number of speculated sequences
+        # times the max emitted per seq.
+        return total_num_spec_seqs * num_emitted_per_seq_if_all_accepted
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index 8b722476853fa..7cf338bbae5f0 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -6,8 +6,7 @@
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeProposer)
-from vllm.spec_decode.util import (maybe_mock_device_tensors,
-                                   sampler_output_to_torch)
+from vllm.spec_decode.util import sampler_output_to_torch
 from vllm.worker.worker import Worker
 
 
@@ -329,12 +328,15 @@ def _merge_outputs(
         """
         if maybe_sampler_output is None:
             # If no speculative tokens, the sampler output will be None.
-            # In this case we return empty tensors.
-            proposal_tokens = torch.zeros(0,
-                                          max_proposal_len,
-                                          dtype=torch.long,
-                                          device=self._device)
-            proposal_probs = torch.zeros(0,
+            # In this case we return empty proposals.
+            proposal_tokens = torch.full(size=(
+                batch_size,
+                max_proposal_len,
+            ),
+                                         fill_value=-1,
+                                         dtype=torch.long,
+                                         device=self._device)
+            proposal_probs = torch.zeros(batch_size,
                                          max_proposal_len,
                                          self._vocab_size,
                                          dtype=torch.float32,
@@ -345,17 +347,6 @@ def _merge_outputs(
             return proposal_tokens, proposal_probs, proposal_lens_tensor
 
         sampler_output = maybe_sampler_output
-
-        # We mock the device tensors until PR 7/9 is merged (e2e correctness).
-        # https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer
-        for step_output in sampler_output:
-            maybe_mock_device_tensors(
-                sampler_output=step_output,
-                batch_size=len(proposal_lens),
-                vocab_size=self._vocab_size,
-                device=self._device,
-            )
-
         proposal_tokens, proposal_probs = sampler_output_to_torch(
             sampler_output)
 
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 68a2a774ef4b7..2c6642f5a3c81 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -111,6 +111,32 @@ def init_device(self) -> None:
             device=self.device,
             vocab_size=self._vocab_size)
 
+        self._configure_model_sampler_for_spec_decode()
+
+    def _configure_model_sampler_for_spec_decode(self):
+        """Configure model sampler to emit GPU tensors. This allows spec decode
+        to keep data on device without transferring to CPU and serializing,
+        which significantly reduces overhead of rejection sampling.
+
+        NOTE(cade): This breaks abstraction boundaries pretty badly. The better
+        design is to have the "move to CPU and serialize" sampling decision be
+        done outside of the model/sampler; this way the "last-mile" worker
+        object which interfaces with the scheduler can serialize and incur the
+        performance hit as necessary. This allows us to run the worker several
+        iterations in a row without incurring the "move to CPU and serialize"
+        performance penalty.
+
+        Since this requires a large change to vLLM, we defer it to later and
+        temporarily accept this broken abstraction boundary.
+
+        NOTE(cade): This will require a special check if the proposer worker
+        does not have a sampler (e.g. ngram speculation).
+        """
+        (self.scorer_worker.model_runner.model.sampler.include_gpu_probs_tensor
+         ) = True
+        (self.proposer_worker.model_runner.model.sampler.
+         include_gpu_probs_tensor) = True
+
     def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of cache blocks to use.
 
@@ -286,15 +312,26 @@ def _verify_tokens(
             select_proposal_len_zero=True)
         original_indices = spec_indices + non_spec_indices
 
-        proposal_probs = proposal_scores.probs[spec_indices, :-1]
-        bonus_token_ids = proposal_scores.token_ids[spec_indices, -1:]
+        # Get probabilities of target model, excluding bonus token.
+        proposal_verifier_probs = proposal_scores.probs[spec_indices, :-1]
+
+        # Get non-speculative sampled tokens from target model.
         non_spec_token_ids = proposal_scores.token_ids[non_spec_indices]
 
+        # Get bonus tokens from target model.
+        bonus_token_ids = proposal_scores.token_ids[spec_indices, -1:]
+
+        # Get probabilities according to proposal method.
+        proposal_probs = proposals.proposal_probs[spec_indices]
+
+        # Get proposed tokens.
+        proposal_token_ids = proposals.proposal_token_ids[spec_indices]
+
         accepted_token_ids = self.rejection_sampler(
-            proposal_probs,
-            bonus_token_ids,
-            proposals.proposal_probs,
-            proposals.proposal_token_ids,
+            target_probs=proposal_verifier_probs,
+            bonus_token_ids=bonus_token_ids,
+            draft_probs=proposal_probs,
+            draft_token_ids=proposal_token_ids,
         )
 
         # Append output tokens from non-speculative sequences to

From 650eca054fa3e7cfc220a56c1538f7ed4135ba04 Mon Sep 17 00:00:00 2001
From: Jack Gordley <jgordley99@gmail.com>
Date: Tue, 23 Apr 2024 12:06:29 +0100
Subject: [PATCH 075/106] [Bugfix] Fixing max token error message for openai
 compatible server (#4016)

---
 vllm/entrypoints/openai/serving_engine.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index f7073feb2f3d1..e89d497f436b7 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -214,6 +214,12 @@ def _validate_prompt_and_tokenize(
         token_num = len(input_ids)
 
         if request.max_tokens is None:
+            if token_num >= self.max_model_len:
+                raise ValueError(
+                    f"This model's maximum context length is "
+                    f"{self.max_model_len} tokens. However, you requested "
+                    f"{token_num} tokens in the messages, "
+                    f"Please reduce the length of the messages.", )
             request.max_tokens = self.max_model_len - token_num
 
         if token_num + request.max_tokens > self.max_model_len:

From 328da32c692c2c6fcfe2b801e75594bfea17d251 Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Wed, 24 Apr 2024 00:28:35 +0800
Subject: [PATCH 076/106] [Bugfix] Add init_cached_hf_modules to
 RayWorkerWrapper (#4286)

---
 vllm/executor/ray_gpu_executor.py | 2 ++
 vllm/worker/worker_base.py        | 7 ++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index d0b5e682bb6f7..e69f104e7d5a4 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -100,6 +100,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
             )(RayWorkerWrapper).remote(
                 worker_module_name="vllm.worker.worker",
                 worker_class_name="Worker",
+                trust_remote_code=self.model_config.trust_remote_code,
             )
 
             worker_ip = ray.get(worker.get_node_ip.remote())
@@ -110,6 +111,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
                 self.driver_worker = RayWorkerWrapper(
                     worker_module_name="vllm.worker.worker",
                     worker_class_name="Worker",
+                    trust_remote_code=self.model_config.trust_remote_code,
                 )
             else:
                 # Else, added to the list of workers.
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index bcd04e0f98db6..b5dade0a770a0 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -103,10 +103,15 @@ class WorkerWrapperBase:
 
     def __init__(self,
                  worker_module_name=None,
-                 worker_class_name=None) -> None:
+                 worker_class_name=None,
+                 trust_remote_code: bool = False) -> None:
         self.worker_module_name = worker_module_name
         self.worker_class_name = worker_class_name
         self.worker = None
+        if trust_remote_code:
+            # note: lazy import to avoid importing torch before initializing
+            from vllm.utils import init_cached_hf_modules
+            init_cached_hf_modules()
 
     @staticmethod
     def update_environment_variables(envs: Dict[str, str]) -> None:

From c2abce3fd14b7711395fe76d71ee5df5642b2ea9 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 23 Apr 2024 09:45:52 -0700
Subject: [PATCH 077/106] [Core][Logging] Add last frame information for better
 debugging (#4278)

---
 vllm/logger.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/vllm/logger.py b/vllm/logger.py
index 046f0e9099a4b..341fc473585d7 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -83,13 +83,27 @@ def _trace_calls(log_path, root_dir, frame, event, arg=None):
             return
         # Log every function call or return
         try:
+            last_frame = frame.f_back
+            if last_frame is not None:
+                last_filename = last_frame.f_code.co_filename
+                last_lineno = last_frame.f_lineno
+                last_func_name = last_frame.f_code.co_name
+            else:
+                # initial frame
+                last_filename = ""
+                last_lineno = 0
+                last_func_name = ""
             with open(log_path, 'a') as f:
                 if event == 'call':
                     f.write(f"{datetime.datetime.now()} Call to"
-                            f" {func_name} in {filename}:{lineno}\n")
+                            f" {func_name} in {filename}:{lineno}"
+                            f" from {last_func_name} in {last_filename}:"
+                            f"{last_lineno}\n")
                 else:
                     f.write(f"{datetime.datetime.now()} Return from"
-                            f" {func_name} in {filename}:{lineno}\n")
+                            f" {func_name} in {filename}:{lineno}"
+                            f" to {last_func_name} in {last_filename}:"
+                            f"{last_lineno}\n")
         except NameError:
             # modules are deleted during shutdown
             pass

From 2a80bcf30f796c444f93516934ed261b9b5f60d2 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Tue, 23 Apr 2024 09:51:41 -0700
Subject: [PATCH 078/106] [CI] Add ccache for wheel builds job (#4281)

---
 .github/workflows/publish.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index fc97e33c19af2..4b9fc3d04d872 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -56,6 +56,9 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v3
 
+      - name: Setup ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+
       - name: Set up Linux Env
         if: ${{ runner.os == 'Linux' }}
         run: |

From 9e73227ca8b00b5adda0e085b3c63dcdc5c5c3ad Mon Sep 17 00:00:00 2001
From: James Fleming <jaemz@alum.mit.edu>
Date: Tue, 23 Apr 2024 13:59:33 -0400
Subject: [PATCH 079/106] AQLM CUDA support (#3287)

Co-authored-by: mgoin <michael@neuralmagic.com>
---
 CMakeLists.txt                                |   1 +
 benchmarks/kernels/benchmark_aqlm.py          | 302 ++++++++
 csrc/ops.h                                    |  15 +
 csrc/pybind.cpp                               |   2 +
 csrc/quantization/aqlm/gemm_kernels.cu        | 712 ++++++++++++++++++
 examples/aqlm_example.py                      |  46 ++
 tests/models/test_aqlm.py                     |  95 +++
 vllm/model_executor/layers/linear.py          |  41 +-
 .../layers/quantization/__init__.py           |   2 +
 .../layers/quantization/aqlm.py               | 373 +++++++++
 .../model_executor/layers/quantization/awq.py |   4 +-
 .../layers/quantization/gptq.py               |   3 +-
 .../layers/quantization/marlin.py             |   3 +-
 .../layers/quantization/squeezellm.py         |   4 +-
 14 files changed, 1592 insertions(+), 11 deletions(-)
 create mode 100644 benchmarks/kernels/benchmark_aqlm.py
 create mode 100644 csrc/quantization/aqlm/gemm_kernels.cu
 create mode 100644 examples/aqlm_example.py
 create mode 100644 tests/models/test_aqlm.py
 create mode 100644 vllm/model_executor/layers/quantization/aqlm.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1845151181284..b2d0cf3e568b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -173,6 +173,7 @@ set(VLLM_EXT_SRC
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_EXT_SRC
+    "csrc/quantization/aqlm/gemm_kernels.cu"
     "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/quantization/marlin/marlin_cuda_kernel.cu"
     "csrc/custom_all_reduce.cu")
diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py
new file mode 100644
index 0000000000000..9602d20bcbc74
--- /dev/null
+++ b/benchmarks/kernels/benchmark_aqlm.py
@@ -0,0 +1,302 @@
+import argparse
+import os
+import sys
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from vllm._C import ops
+from vllm.model_executor.layers.quantization.aqlm import (
+    dequantize_weight, generic_dequantize_gemm, get_int_dtype,
+    optimized_dequantize_gemm)
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def torch_mult(
+        input: torch.Tensor,  #  [..., in_features]
+        weights: torch.Tensor,
+        scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+) -> torch.Tensor:
+    output = F.linear(input, weights)
+    return output
+
+
+def dequant_out_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    if bias is None:
+        output = F.linear(input, weights, bias)
+        orig_shape = output.shape
+        flattened_output = output.view(-1, output.size(-1))
+        f_scales = scales.view(-1, scales.shape[0])
+        b_scales = f_scales.expand(flattened_output.shape[0], -1)
+        flattened_output *= b_scales
+        return flattened_output.view(orig_shape)
+    else:
+        b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+            -1, weights.shape[1])
+        weights *= b_scales
+        return F.linear(input, weights, bias)
+
+
+def dequant_weight_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+        -1, weights.shape[1])
+    weights *= b_scales
+    return F.linear(input, weights, bias)
+
+
+def dequant_no_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    return F.linear(input, weights, bias)
+
+
+# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
+# the generic pytorch version.
+# Just visual comparison.
+def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
+
+    n = parts.sum().item()
+
+    device = torch.device('cuda:0')
+
+    code_range = (1 << bits) // 2
+    ingroups = 8
+
+    codes = torch.randint(-code_range,
+                          code_range,
+                          size=(n, k // ingroups, nbooks),
+                          dtype=get_int_dtype(bits),
+                          device=device)
+
+    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+                            dtype=torch.float16,
+                            device=device)
+
+    count = 0
+    for index in range(16):
+        for i in range(8):
+            for book in range(nbooks):
+                codebooks[book, index, 0, i] = count * (10**book)
+            count += 1
+
+    print("codes shape", codes.shape)
+
+    for i in range(16):
+        for book in range(nbooks):
+            codes[0, i, book] = i
+            codes[0, -i, book] = i
+
+    weights = dequantize_weight(codes, codebooks, None)
+    weights2 = ops.aqlm_dequant(codes, codebooks, parts)
+
+    print("weights shape:", weights.shape)
+    print("weights2 shape:", weights2.shape)
+
+    print("weights are:", weights)
+    print("weights2 are:", weights2)
+
+    print("first 128 weights are", weights[0, 0:128].to(torch.int32))
+    print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
+
+    print("last 128 weights are", weights[0, -128:])
+    print("last 128 weights2 are:", weights2[0, -128:])
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
+
+    # Add arguments
+    parser.add_argument("--nbooks",
+                        type=int,
+                        default=1,
+                        help="Number of codebooks (default: 1)")
+    parser.add_argument("--bits",
+                        type=int,
+                        default=16,
+                        help="Number of bits per code element (default: 16)")
+    parser.add_argument(
+        "--test",
+        type=bool,
+        default=False,
+        help="Run the decompression/dequant tester rather than benchmarking "
+        "(default: False)")
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    # Extract values
+    nbooks = args.nbooks
+    bits = args.bits
+
+    if args.test:
+        dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
+        return
+
+    # Otherwise, benchmark.
+    methods = [
+        ops.aqlm_gemm,
+        dequant_out_scale,
+        generic_dequantize_gemm,
+        optimized_dequantize_gemm,
+        dequant_weight_scale,
+        torch_mult,
+        dequant_no_scale,
+    ]
+
+    filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
+    print(f"writing benchmarks to file {filename}")
+    with open(filename, "w") as f:
+        sys.stdout = f
+
+        print('m | k | n | n parts', end='')
+        for method in methods:
+            print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
+        print('')
+
+        # These are reasonable prefill sizes.
+        ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
+                         (4096, (11008, 11008)), (11008, (4096, )))
+
+        # reasonable ranges for m.
+        for m in [
+                1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
+                128, 256, 512, 1024, 1536, 2048, 3072, 4096
+        ]:
+            print(f'{m}', file=sys.__stdout__)
+            for ksp in ksandpartions:
+                run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
+                         methods)
+
+        sys.stdout = sys.__stdout__
+
+
+def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
+             methods):
+
+    # I didn't see visible improvements from increasing these, but feel free :)
+    num_warmup_trials = 1
+    num_trials = 1
+
+    num_calls = 100
+
+    # warmup.
+    for method in methods:
+        for _ in range(num_warmup_trials):
+            run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+    n = parts.sum().item()
+    print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
+
+    for method in methods:
+        best_time_us = 1e20
+        for _ in range(num_trials):
+            kernel_dur_ms = run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+            kernel_dur_us = 1000 * kernel_dur_ms
+
+            if kernel_dur_us < best_time_us:
+                best_time_us = kernel_dur_us
+
+        print(f' | {kernel_dur_us:.0f}', end='')
+
+    print('')
+
+
+def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
+               nbooks: int, bits: int, method) -> float:
+
+    n = parts.sum().item()
+
+    device = torch.device('cuda:0')
+
+    input = torch.randn((1, m, k), dtype=torch.float16, device=device)
+
+    code_range = (1 << bits) // 2
+    ingroups = 8
+
+    codes = torch.randint(-code_range,
+                          code_range,
+                          size=(n, k // ingroups, nbooks),
+                          dtype=get_int_dtype(bits),
+                          device=device)
+
+    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+                            dtype=torch.float16,
+                            device=device)
+
+    scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
+
+    # for comparison to just a pytorch mult.
+    weights = torch.randn((n, k), dtype=torch.float16, device=device)
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+
+    if method is torch_mult:
+        for i in range(num_calls):
+            torch_mult(input, weights, scales)
+    else:
+        for i in range(num_calls):
+            method(input, codes, codebooks, scales, parts, None)
+
+    end_event.record()
+    end_event.synchronize()
+
+    dur_ms = start_event.elapsed_time(end_event) / num_calls
+    return dur_ms
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/csrc/ops.h b/csrc/ops.h
index 41ecc1e89371b..a379c910d9cf3 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -86,6 +86,21 @@ void gelu_fast(
   torch::Tensor& input);
 
 #ifndef USE_ROCM
+torch::Tensor aqlm_gemm(
+  const torch::Tensor& input,
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& scales,
+  const torch::Tensor& codebook_partition_sizes,
+  const std::optional<torch::Tensor>& bias
+);
+
+torch::Tensor aqlm_dequant(
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& codebook_partition_sizes
+);
+
 torch::Tensor awq_gemm(
   torch::Tensor _in_feats,
   torch::Tensor _kernel,
diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp
index de02afc162113..42e92e5382e8e 100644
--- a/csrc/pybind.cpp
+++ b/csrc/pybind.cpp
@@ -63,6 +63,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
 
 // Quantization ops
 #ifndef USE_ROCM
+  ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM");
+  ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM");
   ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
   ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ");
   ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ");
diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu
new file mode 100644
index 0000000000000..4415316e1e8cd
--- /dev/null
+++ b/csrc/quantization/aqlm/gemm_kernels.cu
@@ -0,0 +1,712 @@
+/*
+ * Modified by Neural Magic
+ * Adapted from https://github.com/Vahe1994/AQLM
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <torch/extension.h>
+#include <c10/cuda/CUDAStream.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <iostream>
+#include <cstdlib>
+
+
+namespace vllm {
+namespace aqlm {
+
+__global__ void Code1x16MatVec(
+  const int4* __restrict__ A,
+  const int4* __restrict__ B,
+        int4* __restrict__ C,
+  const int4* __restrict__ codebook,
+  const int prob_m,
+  const int prob_k,
+  const int4 codebook_a_sizes,  // cumulative sizes of A spanning each codebook, at most 3 long.
+  const int codebook_stride // as int4.
+) {
+  int a_gl_stride = prob_k / 8 / 8;
+  int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
+  bool pred = a_gl_rd < prob_m;
+
+  if (pred)
+  {
+    // advance to the correct codebook, this easy because we only multiply one column of the codebook.
+    auto codebook_size = &codebook_a_sizes.x;
+    while (a_gl_rd >= *codebook_size)
+    {
+        codebook += codebook_stride;
+        ++codebook_size;
+    }
+  }
+
+  int b_gl_rd = 0;
+  int c_gl_wr = a_gl_rd;
+  a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
+  int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
+
+  __shared__ int4 sh_b[32 * 9];
+  float res = 0;
+
+  int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32);
+  while (iters--) {
+    // We pad shared memory to avoid bank conflicts during reads
+    __syncthreads();
+    for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) {
+      if (b_gl_rd + i < prob_k / 8)
+        sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i];
+    }
+    __syncthreads();
+    b_gl_rd += 32 * 8;
+
+    int b_sh_rd = 9 * (threadIdx.x % 32);
+    if (pred && a_gl_rd < a_gl_end) {
+      const uint16_t* enc = reinterpret_cast<const uint16_t*>(&A[a_gl_rd]);
+      #pragma unroll
+      for (int i = 0; i < 8; i++) {
+        uint32_t dec[4];
+        // We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't
+        // actually help us; this brings > 2x speedup.
+        asm volatile (
+          "ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
+          : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3])
+          : "l"((void*) &codebook[enc[i]])
+        );
+        half2* a = reinterpret_cast<half2*>(&dec);
+        half2* b = reinterpret_cast<half2*>(&sh_b[b_sh_rd]);
+        half2 res2 = {};
+        #pragma unroll
+        for (int j = 0; j < 4; j++)
+          res2 = __hfma2(a[j], b[j], res2);
+        res += __half2float(res2.x) + __half2float(res2.y);
+        b_sh_rd++;
+      }
+      a_gl_rd += 32;
+    }
+  }
+
+  if (pred) {
+    #pragma unroll
+    for (int i = 16; i > 0; i /= 2)
+      res += __shfl_down_sync(0xffffffff, res, i);
+    if (threadIdx.x % 32 == 0)
+      reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res);
+  }
+}
+
+__global__ void Code2x8MatVec(
+  const int4* __restrict__ A,
+  const int4* __restrict__ B,
+        int4* __restrict__ C,
+  const int4* __restrict__ codebook,
+  int prob_m,
+  int prob_k,
+  const int4 codebook_a_sizes,  // cumulative sizes of A spanning each codebook, at most 3 long.
+  const int codebook_stride // as int4.
+
+) {
+  int a_gl_stride = prob_k / 8 / 8;
+  int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
+  bool pred = a_gl_rd < prob_m;
+
+  if (pred)
+  {
+    // advance to the correct codebook, this easy because we only multiply one column of the codebook.
+    auto codebook_size = &codebook_a_sizes.x;
+    while (a_gl_rd >= *codebook_size)
+    {
+        codebook += codebook_stride;
+        ++codebook_size;
+    }
+  }
+
+  int b_gl_rd = 0;
+  int c_gl_wr = a_gl_rd;
+  a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
+  int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
+  int lane = threadIdx.x % 8;
+
+  extern __shared__ int4 sh[];
+  int4* sh_b = sh;
+  int4* sh_code = sh_b + 32 * 9;
+  int4* sh_code0 = sh_code;
+  int4* sh_code1 = sh_code + 256 * 8;
+
+  for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) {
+    int4 dec = codebook[i];
+    #pragma unroll
+    for (int j = 0; j < 8; j++)
+      sh_code[8 * i + (j + lane) % 8] = dec;
+  }
+  __syncthreads();
+
+  float res = 0;
+
+  int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32);
+  while (iters--) {
+    // We pad shared memory to avoid bank conflicts during reads
+    __syncthreads();
+    for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) {
+      if (b_gl_rd + i < prob_k / 8)
+        sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i];
+    }
+    __syncthreads();
+    b_gl_rd += 32 * 8;
+
+    int b_sh_rd = 9 * (threadIdx.x % 32);
+    if (pred && a_gl_rd < a_gl_end) {
+      const uint8_t* enc = reinterpret_cast<const uint8_t*>(&A[a_gl_rd]);
+      #pragma unroll
+      for (int i = 0; i < 8; i++) {
+        half2* a0 = reinterpret_cast<half2*>(&sh_code0[8 * enc[2 * i + 0] + lane]);
+        half2* a1 = reinterpret_cast<half2*>(&sh_code1[8 * enc[2 * i + 1] + lane]);
+        half2*  b = reinterpret_cast<half2*>(&sh_b[b_sh_rd]);
+        half2 res2 = {};
+        #pragma unroll
+        for (int j = 0; j < 4; j++)
+          res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2);
+        res += __half2float(res2.x) + __half2float(res2.y);
+        b_sh_rd++;
+      }
+      a_gl_rd += 32;
+    }
+  }
+
+  if (pred) {
+    #pragma unroll
+    for (int i = 16; i > 0; i /= 2)
+      res += __shfl_down_sync(0xffffffff, res, i);
+    if (threadIdx.x % 32 == 0)
+      reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res);
+  }
+}
+
+
+__global__ void Code1x16Dequant(
+  const int4* __restrict__ A,
+        int4* __restrict__ C,
+  const int4* __restrict__ codebook,
+  int prob_m,
+  int prob_k,
+  const int4 codebook_a_sizes,  // cumulative sizes of A spanning each codebook, at most 3 long, sums to m.
+  const int codebook_stride // as int4
+) {
+  int a_gl_stride = prob_k / 8 / 8;
+  int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
+  bool pred = a_gl_rd < prob_m;
+
+  if (pred)
+  {
+    // advance to the correct codebook, this easy because we only multiply one column of the codebook.
+    auto codebook_size = &codebook_a_sizes.x;
+    while (a_gl_rd >= *codebook_size)
+    {
+        codebook += codebook_stride;
+        ++codebook_size;
+    }
+  }
+
+  a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
+  int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
+
+  int c_gl_stride = prob_k / 8;
+  int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
+  c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8;
+
+  int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
+  while (iters--) {
+    if (pred && a_gl_rd < a_gl_end) {
+      const uint16_t* enc = reinterpret_cast<const uint16_t*>(&A[a_gl_rd]);
+      #pragma unroll
+      for (int i = 0; i < 8; i++) {
+        int4 chunk;
+        auto dec = reinterpret_cast<uint32_t*>(&chunk);
+        // We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't
+        // actually help us; this brings > 2x speedup.
+        asm volatile (
+          "ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
+          : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3])
+          : "l"((void*) &codebook[enc[i]])
+        );
+
+        C[a_gl_rd * 8 + i] = chunk;
+      }
+    }
+    a_gl_rd += 32;
+  }
+}
+
+
+__global__ void Code2x8Dequant(
+  const int4* __restrict__ A,
+        int4* __restrict__ C,
+  const int4* __restrict__ codebook,
+  int prob_m,
+  int prob_k,
+  const int4 codebook_a_sizes,  // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols.
+  const int codebook_stride // as int4
+) {
+  int a_gl_stride = prob_k / 8 / 8;
+  int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
+  bool pred = a_gl_rd < prob_m;
+
+  if (pred)
+  {
+    // advance to the correct codebook, this easy because we only multiply one column of the codebook.
+    auto codebook_size = &codebook_a_sizes.x;
+    while (a_gl_rd >= *codebook_size)
+    {
+        codebook += codebook_stride;
+        ++codebook_size;
+    }
+  }
+
+  a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
+  int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
+  int lane = threadIdx.x % 8;
+
+  int c_gl_stride = prob_k / 8;
+  int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
+  c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8;
+
+  extern __shared__ int4 sh[];
+  int4* sh_code = sh;
+  int4* sh_code0 = sh_code;
+  int4* sh_code1 = sh_code + 256 * 8;
+
+  for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) {
+    int4 dec = codebook[i];
+    #pragma unroll
+    for (int j = 0; j < 8; j++)
+      sh_code[8 * i + (j + lane) % 8] = dec;
+  }
+  __syncthreads();
+
+  float res = 0;
+
+  int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
+  while (iters--) {
+    if (pred && a_gl_rd < a_gl_end) {
+      const uint8_t* enc = reinterpret_cast<const uint8_t*>(&A[a_gl_rd]);
+      #pragma unroll
+      for (int i = 0; i < 8; i++) {
+        int4 chunk;
+        half2* a0 = reinterpret_cast<half2*>(&sh_code0[8 * enc[2 * i + 0] + lane]);
+        half2* a1 = reinterpret_cast<half2*>(&sh_code1[8 * enc[2 * i + 1] + lane]);
+        #pragma unroll
+        for (int j = 0; j < 4; j++)
+          reinterpret_cast<half2*>(&chunk)[j] = __hadd2(a0[j], a1[j]);
+        C[a_gl_rd * 8 + i] = chunk;
+      }
+    }
+    a_gl_rd += 32;
+  }
+}
+
+inline int ceildiv(int a, int b) {
+  return (a + b - 1) / b;
+}
+
+const int THREAD_M = 16;
+
+void  code1x16_matvec_cuda(
+  const void* __restrict__ A,
+  const void* __restrict__ B,
+        void* __restrict__ C,
+  const void* __restrict__ codebook,
+  int prob_m,
+  int prob_k,
+  const int4 codebook_a_sizes,
+  const int codebook_stride
+) {
+  int sms;
+  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
+  int waves = 0;
+  int thread_m;
+  do {
+    waves++;
+    thread_m = ceildiv(prob_m, waves * sms);
+  } while (thread_m > THREAD_M);
+
+  int blocks = ceildiv(prob_m, thread_m);
+  int threads = 32 * thread_m;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+  Code1x16MatVec<<<blocks, threads, 16*32*9, stream>>>(
+    (const int4*) A,
+    (const int4*) B,
+    (int4*) C,
+    (const int4*) codebook,
+    prob_m,
+    prob_k,
+    codebook_a_sizes,
+    codebook_stride
+  );
+}
+
+void  code2x8_matvec_cuda(
+  const void* __restrict__ A,
+  const void* __restrict__ B,
+        void* __restrict__ C,
+  const void* __restrict__ codebook,
+  int prob_m,
+  int prob_k,
+  const int4 codebook_a_sizes,
+  const int codebook_stride
+) {
+  int sms;
+  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
+  int waves = 0;
+  int thread_m;
+  do {
+    waves++;
+    thread_m = ceildiv(prob_m, waves * sms);
+  } while (thread_m > THREAD_M);
+
+  int blocks = ceildiv(prob_m, thread_m);
+  int threads = 32 * thread_m;
+  int shared = 16 * (2 * 256 * 8 + 32 * 9);
+  cudaFuncSetAttribute(
+    Code2x8MatVec, cudaFuncAttributeMaxDynamicSharedMemorySize, shared
+  );
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+  Code2x8MatVec<<<blocks, threads, shared, stream>>>(
+    (const int4*) A,
+    (const int4*) B,
+    (int4*) C,
+    (const int4*) codebook,
+    prob_m,
+    prob_k,
+    codebook_a_sizes,
+    codebook_stride
+  );
+}
+
+void code1x16_dequant_cuda(
+  const void* __restrict__ A,
+        void* __restrict__ C,
+  const void* __restrict__ codebook,
+  int prob_m,
+  int prob_k,
+  const int4 codebook_a_sizes,  // cumulative sizes of A spanning each codebook, at most 3 long.
+  const int codebook_stride // as int4.
+) {
+  int sms;
+  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
+  int waves = 0;
+  int thread_m;
+  do {
+    waves++;
+    thread_m = ceildiv(prob_m, waves * sms);
+  } while (thread_m > THREAD_M);
+
+  int blocks = ceildiv(prob_m, thread_m);
+  int threads = 32 * thread_m;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+  Code1x16Dequant<<<blocks, threads, 0, stream>>>(
+    (const int4*) A,
+    (int4*) C,
+    (const int4*) codebook,
+    prob_m,
+    prob_k,
+    codebook_a_sizes,  // cumulative sizes of A spanning each codebook, at most 3 long.
+    codebook_stride // as int4.
+  );
+}
+
+// Dequantizes the code and codebook into weights.
+void  code2x8_dequant_cuda(
+  const void* __restrict__ A,
+        void* __restrict__ C,
+  const void* __restrict__ codebook,
+  int prob_m,
+  int prob_k,
+  const int4 codebook_a_sizes,  // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols.
+  const int codebook_stride // as int4
+) {
+  int sms;
+  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
+  int waves = 0;
+  int thread_m;
+  do {
+    waves++;
+    thread_m = ceildiv(prob_m, waves * sms);
+  } while (thread_m > THREAD_M);
+
+  int blocks = ceildiv(prob_m, thread_m);
+  int threads = 32 * thread_m;
+  int shared = 16 * (2 * 256 * 8 + 32 * 9);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+
+  cudaFuncSetAttribute(
+    Code2x8Dequant, cudaFuncAttributeMaxDynamicSharedMemorySize, shared
+  );
+  Code2x8Dequant<<<blocks, threads, shared, stream>>>(
+    (const int4*) A,
+    (int4*) C,
+    (const int4*) codebook,
+    prob_m,
+    prob_k,
+    codebook_a_sizes,
+    codebook_stride
+  );
+}
+
+int codebook_stride(const torch::Tensor& codebooks)
+{
+  return codebooks.stride(0) * codebooks.element_size() / sizeof(int4);
+}
+
+void code1x16_matvec(
+  const torch::Tensor& A,
+  const torch::Tensor& B,
+        torch::Tensor& C,
+  const torch::Tensor& codebook,
+  const int4 codebook_a_sizes  // cumulative sizes of A spanning each codebook, at most 3 long.
+) {
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
+  int prob_m = C.size(0);
+  int prob_k = B.size(0);
+
+  code1x16_matvec_cuda(
+    A.data_ptr(),
+    B.data_ptr(),
+    C.data_ptr(),
+    codebook.data_ptr(),
+    prob_m,
+    prob_k,
+    codebook_a_sizes,
+    codebook_stride(codebook)
+  );
+}
+
+torch::Tensor code1x16_matmat(
+  const torch::Tensor& input,
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& scales,
+  const int4 codebook_a_sizes,
+  const std::optional<torch::Tensor>& bias) {
+  auto input_sizes = input.sizes();
+  auto out_features = codes.size(0) * codebooks.size(2);
+  auto flat_input = input.reshape({-1, input.size(-1)});
+  auto flat_output = torch::empty({flat_input.size(0), out_features},
+    torch::TensorOptions()
+      .dtype(input.dtype())
+      .device(input.device())
+  );
+
+  for (int i = 0; i < flat_input.size(0); ++i) {
+    auto input_vec = flat_input.index({i});
+    auto output_vec = flat_output.index({i});
+    code1x16_matvec(
+      codes.squeeze(2),
+      input_vec,
+      output_vec,
+      codebooks,
+      codebook_a_sizes
+    );
+  }
+  flat_output *= scales.flatten().unsqueeze(0);
+
+  if (bias.has_value()) {
+    flat_output += bias->unsqueeze(0);
+  }
+
+  auto output_sizes = input_sizes.vec();
+  output_sizes.pop_back();
+  output_sizes.push_back(-1);
+  auto output = flat_output.reshape(output_sizes);
+  return output;
+}
+
+void code2x8_matvec(
+  const torch::Tensor& A,
+  const torch::Tensor& B,
+        torch::Tensor& C,
+  const torch::Tensor& codebook,
+  const int4 codebook_a_sizes
+) {
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
+  int prob_m = C.size(0);
+  int prob_k = B.size(0);
+  code2x8_matvec_cuda(
+    A.data_ptr(),
+    B.data_ptr(),
+    C.data_ptr(),
+    codebook.data_ptr(),
+    prob_m,
+    prob_k,
+    codebook_a_sizes,
+    2 * codebook_stride(codebook)
+  );
+}
+
+torch::Tensor code2x8_matmat(
+  const torch::Tensor& input,
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& scales,
+  const int4 codebook_a_sizes,
+  const std::optional<torch::Tensor>& bias
+) {
+  auto input_sizes = input.sizes();
+  auto out_features = codes.size(0) * codebooks.size(2);
+  auto flat_input = input.reshape({-1, input.size(-1)});
+  auto flat_output = torch::empty({flat_input.size(0), out_features},
+    torch::TensorOptions()
+      .dtype(input.dtype())
+      .device(input.device())
+  );
+
+  for (int i = 0; i < flat_input.size(0); ++i) {
+    auto input_vec = flat_input.index({i});
+    auto output_vec = flat_output.index({i});
+    code2x8_matvec(
+      codes.squeeze(2),
+      input_vec,
+      output_vec,
+      codebooks,
+      codebook_a_sizes
+    );
+  }
+  flat_output *= scales.flatten().unsqueeze(0);
+  if (bias.has_value()) {
+    flat_output += bias->unsqueeze(0);
+  }
+
+  auto output_sizes = input_sizes.vec();
+  output_sizes.pop_back();
+  output_sizes.push_back(-1);
+  auto output = flat_output.reshape(output_sizes);
+  return output;
+}
+
+// Accumulate the partition sizes.
+int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes)
+{
+  int4 cumulative_sizes;
+  auto cumulative_size = &cumulative_sizes.x;
+  int i = 0;
+  int last = 0;
+  assert(codebook_partition_sizes.size(0) <= 4);
+  for (; i <  codebook_partition_sizes.size(0); ++i, ++cumulative_size)
+  {
+    *cumulative_size = codebook_partition_sizes[i].item<int>() + last;
+    last = *cumulative_size;
+  }
+  // fill in the rest with unreachable.
+  for (; i < 4; ++i, ++cumulative_size)
+  {
+    *cumulative_size = last*10;
+  }
+  return cumulative_sizes;
+}
+
+} // namespace aqlm
+} // namespace vllm
+
+
+torch::Tensor aqlm_gemm(
+  const torch::Tensor& input,
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& scales,
+  const torch::Tensor& codebook_partition_sizes,
+  const std::optional<torch::Tensor>& bias
+)
+{
+  int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
+
+  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
+  int const entries = codebooks.size(1);
+
+  if (nbooks == 1 && entries == (1 << 16))
+  { 
+    return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias);
+  }
+  if (nbooks == 2 && entries == (1 << 8))
+  {
+    return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, cumulative_sizes, bias);
+  }
+
+  TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.")
+  return {};
+}
+
+torch::Tensor aqlm_dequant(
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& codebook_partition_sizes
+)
+{
+  int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
+
+  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
+  int const entries = codebooks.size(1);
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(codes));
+  int rows = codes.size(1);
+  int cols = codes.size(0);
+
+  auto in_features = codes.size(1) * 8;
+  auto out_features = codes.size(0);
+
+  assert(out_features = codebook_partition_sizes.sum().item<int>());
+
+  auto weights = torch::empty({out_features, in_features},
+    torch::TensorOptions()
+      .dtype(codebooks.dtype())
+      .device(codebooks.device())
+  );
+
+  if (nbooks == 1 && entries == (1 << 16))
+  {
+    vllm::aqlm::code1x16_dequant_cuda(
+      codes.data_ptr(),
+      weights.data_ptr(),
+      codebooks.data_ptr(),
+      out_features,
+      in_features,
+      cumulative_sizes,
+      vllm::aqlm::codebook_stride(codebooks));
+
+    // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation.)
+    // weights *= scales.index({"...", 0, 0});
+
+     return weights;
+  }
+
+  if (nbooks == 2 && entries == (1 << 8))
+  {
+     vllm::aqlm::code2x8_dequant_cuda(
+        codes.data_ptr(), 
+        weights.data_ptr(), 
+        codebooks.data_ptr(), 
+        out_features,
+        in_features, 
+        cumulative_sizes, 
+        vllm::aqlm::codebook_stride(codebooks));
+
+    // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation)
+    // weights *= scales.index({"...", 0, 0});
+
+     return weights;
+  }
+
+  TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.")
+  return {};
+}
diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py
new file mode 100644
index 0000000000000..e7c17fa0362ae
--- /dev/null
+++ b/examples/aqlm_example.py
@@ -0,0 +1,46 @@
+import argparse
+
+from vllm import LLM, SamplingParams
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='AQLM examples')
+
+    parser.add_argument('--model',
+                        '-m',
+                        type=str,
+                        default=None,
+                        help='model path, as for HF')
+    parser.add_argument('--choice',
+                        '-c',
+                        type=int,
+                        default=0,
+                        help='known good models by index, [0-4]')
+    parser.add_argument('--tensor_parallel_size',
+                        '-t',
+                        type=int,
+                        default=1,
+                        help='tensor parallel size')
+
+    args = parser.parse_args()
+
+    models = [
+        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
+        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf",
+        "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf",
+        "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
+        "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf",
+    ]
+
+    model = LLM(args.model if args.model is not None else models[args.choice],
+                tensor_parallel_size=args.tensor_parallel_size)
+
+    sampling_params = SamplingParams(max_tokens=100, temperature=0)
+    outputs = model.generate("Hello my name is",
+                             sampling_params=sampling_params)
+    print(outputs[0].outputs[0].text)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py
new file mode 100644
index 0000000000000..a7abc011f57d7
--- /dev/null
+++ b/tests/models/test_aqlm.py
@@ -0,0 +1,95 @@
+"""Compare the outputs of a AQLM model between vLLM and HF Transformers
+
+Run `pytest tests/models/test_aqlm.py`.
+"""
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+aqlm_not_supported = (capability <
+                      QUANTIZATION_METHODS["aqlm"].get_min_capability())
+
+# In this test we hardcode prompts and generations for the model so we don't
+# need to require the AQLM package as a dependency
+example_prompts = [
+    'vLLM is a high-throughput and memory-efficient inference and serving '
+    'engine for LLMs.\n',
+    'Briefly describe the major milestones in the development of artificial '
+    'intelligence from 1950 to 2020.\n',
+    'Compare and contrast artificial intelligence with human intelligence in '
+    'terms of processing information.\n',
+    'Describe the basic components of a neural network and how it can be '
+    'trained.\n',
+    'Write a short story about a robot that dreams for the first time.\n',
+    'Analyze the impact of the COVID-19 pandemic on global economic structures '
+    'and future business models.\n',
+    'Explain the cultural significance of the Mona Lisa painting, and how its '
+    'perception might vary in Western versus Eastern societies.\n',
+    "Translate the following English sentence into Japanese, French, and "
+    "Swahili: 'The early bird catches the worm.'\n"
+]
+
+# These ground truth generations were generated using `transformers==4.38.1
+# aqlm==1.1.0 torch==2.2.0`
+# and the below code:
+# ```python
+# from transformers import AutoTokenizer, AutoModelForCausalLM
+# model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"
+# quantized_model = AutoModelForCausalLM.from_pretrained(model_id,
+# torch_dtype="auto", device_map="cuda").cuda()
+# tokenizer = AutoTokenizer.from_pretrained(model_id)
+# outputs = []
+# for prompt in example_prompts:
+#     input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")
+#     hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32)
+#     outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:]))
+# print(outputs)
+# ```
+ground_truth_generations = [
+    '\n### Features\n\n- **High-throughput**: v',
+    'The major milestones in the development of artificial intelligence from '
+    '195',
+    'Compare and contrast artificial intelligence with human intelligence in '
+    'terms of processing information. The',
+    'Explain the difference between supervised and unsupervised learning.'
+    '\nExplain',
+    'Write a short story about a robot that dreams for the first time. The',
+    'Analyze the impact of the COVID-19 pandemic on global economic',
+    'The Mona Lisa is a painting by Leonardo da Vinci, and it',
+    'The early bird catches the worm.\nThe early bird catches the'
+]
+
+
+@pytest.mark.skipif(aqlm_not_supported,
+                    reason="AQLM is not supported on this GPU type.")
+@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [16])
+@pytest.mark.parametrize("num_logprobs", [1])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+
+    vllm_model = vllm_runner(model, dtype=dtype)
+    vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
+                                                       max_tokens,
+                                                       num_logprobs)
+
+    # loop through the prompts to compare against the ground truth generations
+    for prompt_idx in range(len(example_prompts)):
+        vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[
+            prompt_idx]
+
+        print("Prompt:          ", repr(example_prompts[prompt_idx]))
+        print("Reference output:", repr(ground_truth_generations[prompt_idx]))
+        print("Output output:   ", repr(vllm_output_str))
+        assert vllm_output_str == ground_truth_generations[prompt_idx]
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index a0136c6b31c92..9cf8bcd825482 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -33,7 +33,7 @@ class LinearMethodBase(ABC):
     @abstractmethod
     def create_weights(self, layer: torch.nn.Module,
                        input_size_per_partition: int,
-                       output_size_per_partition: int, input_size: int,
+                       output_partition_sizes: List[int], input_size: int,
                        output_size: int, params_dtype: torch.dtype,
                        **extra_weight_attrs):
         """Create weights for a linear layer.
@@ -72,9 +72,10 @@ def __init__(self, separate_bias_add: bool = False):
 
     def create_weights(self, layer: torch.nn.Module,
                        input_size_per_partition: int,
-                       output_size_per_partition: int, input_size: int,
+                       output_partition_sizes: List[int], input_size: int,
                        output_size: int, params_dtype: torch.dtype,
                        **extra_weight_attrs):
+        output_size_per_partition = sum(output_partition_sizes)
         weight = Parameter(torch.empty(output_size_per_partition,
                                        input_size_per_partition,
                                        dtype=params_dtype),
@@ -129,7 +130,7 @@ def __init__(
             linear_method = UnquantizedLinearMethod()
         self.linear_method = linear_method
         self.linear_method.create_weights(self, self.input_size,
-                                          self.output_size, self.input_size,
+                                          [self.output_size], self.input_size,
                                           self.output_size, self.params_dtype)
         if bias:
             self.bias = Parameter(
@@ -163,6 +164,8 @@ class ColumnParallelLinear(torch.nn.Module):
                        skip adding bias but instead return it.
         params_dtype: Data type for the parameters.
         linear_method: (Maybe quantized) linear method.
+        output_sizes: list of output sizes packed into one output, like for QKV
+                       the list would be size 3.
     """
 
     def __init__(
@@ -174,6 +177,7 @@ def __init__(
         skip_bias_add: bool = False,
         params_dtype: Optional[torch.dtype] = None,
         linear_method: Optional[LinearMethodBase] = None,
+        output_sizes: Optional[List[int]] = None,
     ):
         super().__init__()
 
@@ -190,10 +194,12 @@ def __init__(
         self.params_dtype = params_dtype
         if linear_method is None:
             linear_method = UnquantizedLinearMethod()
+        if output_sizes is None:
+            output_sizes = [output_size]
         self.linear_method = linear_method
         self.linear_method.create_weights(self,
                                           self.input_size,
-                                          self.output_size_per_partition,
+                                          [x // tp_size for x in output_sizes],
                                           self.input_size,
                                           self.output_size,
                                           self.params_dtype,
@@ -276,14 +282,17 @@ def __init__(
         tp_size = get_tensor_model_parallel_world_size()
         assert all(output_size % tp_size == 0 for output_size in output_sizes)
         super().__init__(input_size, sum(output_sizes), bias, gather_output,
-                         skip_bias_add, params_dtype, linear_method)
+                         skip_bias_add, params_dtype, linear_method,
+                         self.output_sizes)
 
     def weight_loader(self,
                       param: Parameter,
                       loaded_weight: torch.Tensor,
                       loaded_shard_id: Optional[int] = None):
+
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
+        is_metadata = getattr(param, "is_metadata", False)
         if loaded_shard_id is None:
             # Loaded weight is already packed.
             if output_dim is None:
@@ -336,6 +345,11 @@ def weight_loader(self,
             start_idx = tp_rank * shard_size
             loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                  shard_size)
+        elif is_metadata:
+            # metadata indicates fixed size concatenated along dim 0
+            shard_size = loaded_weight.shape[0]
+            shard_offset = loaded_shard_id * shard_size
+            param_data = param_data.narrow(0, shard_offset, shard_size)
         else:
             ignore_warning = getattr(param, "ignore_warning", False)
             if not ignore_warning:
@@ -415,8 +429,14 @@ def __init__(
         input_size = self.hidden_size
         output_size = (self.num_heads +
                        2 * self.num_kv_heads) * tp_size * self.head_size
+        output_sizes = [
+            self.num_heads * tp_size * self.head_size,
+            self.num_kv_heads * tp_size * self.head_size,
+            self.num_kv_heads * tp_size * self.head_size
+        ]
+
         super().__init__(input_size, output_size, bias, False, skip_bias_add,
-                         params_dtype, linear_method)
+                         params_dtype, linear_method, output_sizes)
 
     def weight_loader(self,
                       param: Parameter,
@@ -424,6 +444,7 @@ def weight_loader(self,
                       loaded_shard_id: Optional[str] = None):
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
+        is_metadata = getattr(param, "is_metadata", False)
 
         if loaded_shard_id is None:
             # Loaded weight is already packed.
@@ -491,6 +512,12 @@ def weight_loader(self,
             start_idx = shard_id * shard_size
             loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                  shard_size)
+        elif is_metadata:
+            # metadata indicates fixed size concatenated along dim 0
+            shard_size = loaded_weight.shape[0]
+            shard_index = ["q", "k", "v"].index(loaded_shard_id)
+            param_data = param_data.narrow(0, shard_index * shard_size,
+                                           shard_size)
         else:
             ignore_warning = getattr(param, "ignore_warning", False)
             if not ignore_warning:
@@ -567,7 +594,7 @@ def __init__(
         self.linear_method = linear_method
         self.linear_method.create_weights(self,
                                           self.input_size_per_partition,
-                                          self.output_size,
+                                          [self.output_size],
                                           self.input_size,
                                           self.output_size,
                                           self.params_dtype,
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index 0344d6e4e3e45..a525add458499 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -1,5 +1,6 @@
 from typing import Type
 
+from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
@@ -9,6 +10,7 @@
 from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
 
 QUANTIZATION_METHODS = {
+    "aqlm": AQLMConfig,
     "awq": AWQConfig,
     "fp8": FP8Config,
     "gptq": GPTQConfig,
diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py
new file mode 100644
index 0000000000000..6115b1de679ad
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/aqlm.py
@@ -0,0 +1,373 @@
+# Supports AQLM compression, see https://github.com/Vahe1994/AQLM
+# and https://arxiv.org/pdf/2401.06118.pdf
+
+import math
+from typing import Any, Dict, List, Optional
+
+import torch
+import torch.nn.functional as F
+from torch.nn.parameter import Parameter
+
+from vllm._C import ops
+from vllm.model_executor.layers.linear import (LinearMethodBase,
+                                               set_weight_attrs)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+
+
+def get_int_dtype(nbits: int) -> torch.dtype:
+    if nbits <= 8:
+        return torch.int8
+    if nbits <= 16:
+        return torch.int16
+    if nbits <= 32:
+        return torch.int32
+    if nbits <= 64:
+        return torch.int64
+    raise ValueError(f"No dtype available for {nbits}-bit codebooks")
+
+
+@torch.inference_mode()
+def unpack_int_data(data: torch.IntTensor, nbits: int) -> torch.IntTensor:
+    return data.to(torch.int64) % (2**nbits)
+
+
+def dequantize_weight(codes: torch.Tensor,
+                      codebooks: torch.Tensor,
+                      scales: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """
+    Decode float weights from quantization codes. Differentiable.
+    :param codes: tensor of integer quantization codes, shape 
+        [*dims, num_out_groups, num_in_groups, num_codebooks]
+    :param codebooks: tensor of vectors for each quantization code, 
+        [num_codebooks, codebook_size, out_group_size, in_group_size]
+    :param scales: weight will be multiplied by this factor, must be 
+        broadcastble with 
+        [*dims, out_groups, num_in_groups, out_group_size, in_group_size]
+    :return: reconstructed weight tensor of shape 
+        [*dims, num_in_groups*group_size]
+    """
+    num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:]
+    num_codebooks, codebook_size, out_group_size, in_group_size = \
+        codebooks.shape
+    out_features = num_out_groups * out_group_size
+    in_features = num_in_groups * in_group_size
+    codebook_offsets = torch.arange(
+        0, num_codebooks * codebook_size, codebook_size,
+        device=codes.device)  # shape: [num_codebooks]
+    reconstructed_weight_flat = F.embedding_bag(
+        codes.flatten(0, -2) + codebook_offsets,
+        codebooks.flatten(0, 1).flatten(-2, -1),
+        mode="sum"
+    )  # [prod(dims) * num_out_groups * num_in_groups, out_group_size
+    # * in_group_size]
+
+    reconstructed_weight_groupwise = reconstructed_weight_flat.view(
+        list(codes.shape[:-3]) +
+        [num_out_groups, num_in_groups, out_group_size, in_group_size])
+    if scales is not None:
+        reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul(
+            scales)
+    return reconstructed_weight_groupwise.swapaxes(
+        -3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features])
+
+
+def dequantize_gemm(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    dequantized_weight = dequantize_weight(
+        unpack_int_data(codes, codebooks.shape[1].bit_length() - 1),
+        codebooks,
+        scales,
+    )
+    return F.linear(input, dequantized_weight, bias)
+
+
+# Generic dequantization, slow but flexible.
+def generic_dequantize_gemm(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    output_shape = input.shape[:-1] + (scales.shape[0], )
+    output = torch.empty(output_shape, dtype=input.dtype, device=input.device)
+    num_outputs = len(output_partition_sizes)
+
+    # break the inputs and codebooks apart then combine the outputs.
+    # Surprisingly (to me) this is faster than doing 3 de-quants and 1 big
+    # multiply at the end.
+    num_codebooks = codebooks.shape[0] // num_outputs
+    assert (scales.shape[0] == codes.shape[0])
+    assert (sum(output_partition_sizes) == scales.shape[0])
+    output_offset = 0
+    codebooks_offset = 0
+    for output_size in output_partition_sizes:
+        shard_output = dequantize_gemm(
+            input, codes.narrow(0, output_offset, output_size),
+            codebooks.narrow(0, codebooks_offset, num_codebooks),
+            scales.narrow(0, output_offset, output_size), None
+            if bias is None else bias.narrow(0, output_offset, output_size))
+
+        output_slice = output.narrow(-1, output_offset, output_size)
+        assert (output_slice.shape == shard_output.shape)
+        output_slice.copy_(shard_output)
+        output_offset += output_size
+        codebooks_offset += num_codebooks
+    return output
+
+
+# Optimized dequnantize/decompression kernels, supports 1x16 and 2x8
+# at 6 and 9 times faster than the generic version above, respectively.
+def optimized_dequantize_gemm(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    if bias is None:
+        # scaling the output is fastest, so we do that when possible.
+        output = F.linear(input, weights, bias)
+        orig_shape = output.shape
+        flattened_output = output.view(-1, output.size(-1))
+        f_scales = scales.view(-1, scales.shape[0])
+        b_scales = f_scales.expand(flattened_output.shape[0], -1)
+        flattened_output *= b_scales
+        return output.view(orig_shape)
+    else:
+        b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+            -1, weights.shape[1])
+        weights *= b_scales
+        return F.linear(input, weights, bias)
+
+
+class AQLMConfig(QuantizationConfig):
+    """Config class for AQLM.
+
+    Reference: https://github.com/Vahe1994/AQLM
+    """
+
+    def __init__(
+        self,
+        in_group_size: int,
+        nbits_per_codebook: int,
+        num_codebooks: int,
+        out_group_size: int,
+    ) -> None:
+        self.in_group_size = in_group_size
+        self.nbits_per_codebook = nbits_per_codebook
+        self.num_codebooks = num_codebooks
+        self.out_group_size = out_group_size
+
+        # out_group_size > 1 is untested, and probably won't work as-is.
+        assert (self.out_group_size == 1)
+        self.pack_factor = (self.in_group_size * self.out_group_size)
+
+    def __repr__(self) -> str:
+        return (f"AQLMConfig(in_group_size={self.in_group_size}, "
+                f"nbits_per_codebook={self.nbits_per_codebook}, "
+                f"num_codebooks={self.num_codebooks}, "
+                f"out_group_size={self.out_group_size})")
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "aqlm"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []  # no extra configs.
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "AQLMConfig":
+        in_group_size = cls.get_from_keys(config, ["in_group_size"])
+        nbits_per_codebook = cls.get_from_keys(config, ["nbits_per_codebook"])
+        num_code_books = cls.get_from_keys(config, ["num_codebooks"])
+        out_group_size = cls.get_from_keys(config, ["out_group_size"])
+        return cls(in_group_size, nbits_per_codebook, num_code_books,
+                   out_group_size)
+
+    def get_linear_method(self) -> "AQLMLinearMethod":
+        return AQLMLinearMethod(self)
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class AQLMLinearMethod(LinearMethodBase):
+    """Linear method for AQLM.
+
+    Args:
+        quant_config: The AQLM quantization config.
+    """
+
+    def __init__(self, quant_config: AQLMConfig):
+        self.quant_config = quant_config
+
+    def create_weights(self, layer: torch.nn.Module,
+                       input_size_per_partition: int,
+                       output_partition_sizes: List[int], input_size: int,
+                       output_size: int, params_dtype: torch.dtype,
+                       **extra_weight_attrs):
+        del output_size  # Unused.
+        del input_size  # Unused.
+
+        if params_dtype != torch.half:
+            raise ValueError("Only half is currently supported by aqlm")
+        if input_size_per_partition % self.quant_config.in_group_size != 0:
+            raise ValueError(
+                "The input size is not aligned with the quantized "
+                "weight shape. This can be caused by too large "
+                "tensor parallel size.")
+
+        output_size_per_partition = sum(output_partition_sizes)
+        if output_size_per_partition % self.quant_config.out_group_size != 0:
+            raise ValueError(
+                "The output size is not aligned with the quantized "
+                "weight shape. This can be caused by too large "
+                "tensor parallel size.")
+
+        codes = Parameter(
+            torch.empty(
+                # There could actually be two pack factors, one along input and
+                # one along output, but we don't currently support
+                # out_group_size, and only the one along output needs to be
+                # marked with "packed_dim" in order for QKVLinear to work.
+                output_size_per_partition,
+                input_size_per_partition // self.quant_config.pack_factor,
+                self.quant_config.num_codebooks,
+                dtype=get_int_dtype(self.quant_config.nbits_per_codebook),
+            ),
+            requires_grad=False,
+        )
+
+        set_weight_attrs(
+            codes,
+            {
+                "input_dim": 1,
+                "output_dim": 0,
+                "packed_dim": 1,
+                "pack_factor": self.quant_config.pack_factor,
+            },
+        )
+
+        codebooks = Parameter(
+            torch.empty(
+                self.quant_config.num_codebooks * len(output_partition_sizes),
+                2**self.quant_config.nbits_per_codebook,
+                self.quant_config.out_group_size,
+                self.quant_config.in_group_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            codebooks,
+            {
+                # metadata indicates fixed size concatenated along dim 0
+                "is_metadata":
+                True,
+                "output_partition_sizes":
+                torch.tensor(output_partition_sizes, device='cpu'),
+            },
+        )
+
+        scales = Parameter(
+            torch.empty(
+                (
+                    output_size_per_partition //
+                    self.quant_config.out_group_size,
+                    1,
+                    1,
+                    1,
+                ),
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            scales,
+            {
+                "output_dim": 0,
+                "packed_dim": 0,
+                "pack_factor": self.quant_config.out_group_size
+            },
+        )
+
+        layer.register_parameter("codes", codes)
+        set_weight_attrs(codes, extra_weight_attrs)
+        layer.register_parameter("codebooks", codebooks)
+        set_weight_attrs(codebooks, extra_weight_attrs)
+        layer.register_parameter("scales", scales)
+        set_weight_attrs(scales, extra_weight_attrs)
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        codebooks = layer.codebooks
+        codes = layer.codes
+        scales = layer.scales
+        output_partition_sizes = getattr(codebooks, "output_partition_sizes",
+                                         None)
+
+        nbooks = codes.shape[2]
+        ingroups = codebooks.shape[3]
+        outgroups = codebooks.shape[2]
+        bits = codebooks.shape[1]
+
+        # We support these formats with dedicated gemm and decompression
+        # kernels.
+        if ingroups == 8 and outgroups == 1 and (
+            (bits == 256 and nbooks == 2) or (bits == 65536 and nbooks == 1)):
+
+            # thresholds determined by timings on an A6000, one GPU
+            use_gemv = math.prod(x.shape[:-1]) <= 6
+
+            return ops.aqlm_gemm(
+                x,
+                codes,
+                codebooks,
+                scales,
+                output_partition_sizes,
+                bias,
+            ) if use_gemv else optimized_dequantize_gemm(
+                x,
+                codes,
+                codebooks,
+                scales,
+                output_partition_sizes,
+                bias,
+            )
+
+        # fall back all unoptimized formats
+        return generic_dequantize_gemm(
+            x,
+            codes,
+            codebooks,
+            scales,
+            output_partition_sizes,
+            bias,
+        )
diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
index 98651aed8be0e..4f75134ee1889 100644
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -81,7 +81,7 @@ def __init__(self, quant_config: AWQConfig):
 
     def create_weights(self, layer: torch.nn.Module,
                        input_size_per_partition: int,
-                       output_size_per_partition: int, input_size: int,
+                       output_partition_sizes: List[int], input_size: int,
                        output_size: int, params_dtype: torch.dtype,
                        **extra_weight_attrs):
         if input_size_per_partition % self.quant_config.group_size != 0:
@@ -89,6 +89,8 @@ def create_weights(self, layer: torch.nn.Module,
                 "The input size is not aligned with the quantized "
                 "weight shape. This can be caused by too large "
                 "tensor parallel size.")
+
+        output_size_per_partition = sum(output_partition_sizes)
         if output_size_per_partition % self.quant_config.pack_factor != 0:
             raise ValueError(
                 "The output size is not aligned with the quantized "
diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py
index f370b94a210ee..92a5cdb9af928 100644
--- a/vllm/model_executor/layers/quantization/gptq.py
+++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -91,7 +91,7 @@ def create_weights(
         self,
         layer: torch.nn.Module,
         input_size_per_partition: int,
-        output_size_per_partition: int,
+        output_partition_sizes: List[int],
         input_size: int,
         output_size: int,
         params_dtype: torch.dtype,
@@ -103,6 +103,7 @@ def create_weights(
                 "The input size is not aligned with the quantized "
                 "weight shape. This can be caused by too large "
                 "tensor parallel size.")
+        output_size_per_partition = sum(output_partition_sizes)
         if (output_size_per_partition % self.quant_config.pack_factor.numerator
                 != 0):
             raise ValueError(
diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py
index bf0500f1155a1..00c3c404c2d7a 100644
--- a/vllm/model_executor/layers/quantization/marlin.py
+++ b/vllm/model_executor/layers/quantization/marlin.py
@@ -93,7 +93,7 @@ def create_weights(
         self,
         layer: torch.nn.Module,
         input_size_per_partition: int,
-        output_size_per_partition: int,
+        output_partition_sizes: List[int],
         input_size: int,
         output_size: int,
         params_dtype: torch.dtype,
@@ -106,6 +106,7 @@ def create_weights(
                 f"The params dtype must be float16, but got {params_dtype}")
 
         # Validate output_size_per_partition
+        output_size_per_partition = sum(output_partition_sizes)
         if output_size_per_partition % self.quant_config.min_n_threads != 0:
             raise ValueError(
                 f"Weight output_size_per_partition = "
diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py
index 661ff9c55d0d1..cc44447d347b8 100644
--- a/vllm/model_executor/layers/quantization/squeezellm.py
+++ b/vllm/model_executor/layers/quantization/squeezellm.py
@@ -70,7 +70,7 @@ def __init__(self, quant_config: SqueezeLLMConfig):
 
     def create_weights(self, layer: torch.nn.Module,
                        input_size_per_partition: int,
-                       output_size_per_partition: int, input_size: int,
+                       output_partition_sizes: List[int], input_size: int,
                        output_size: int, params_dtype: torch.dtype,
                        **extra_weight_attrs):
         if input_size_per_partition % self.quant_config.pack_factor != 0:
@@ -78,6 +78,8 @@ def create_weights(self, layer: torch.nn.Module,
                 "The input size is not aligned with the quantized "
                 "weight shape. This can be caused by too large "
                 "tensor parallel size.")
+
+        output_size_per_partition = sum(output_partition_sizes)
         qweight = Parameter(
             torch.empty(
                 input_size_per_partition // self.quant_config.pack_factor,

From f031047500d9e93df2bc79dd8976cd4f37935003 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 24 Apr 2024 02:19:03 +0800
Subject: [PATCH 080/106] [Bugfix][Frontend] Raise exception when file-like
 chat template fails to be opened (#4292)

---
 tests/async_engine/test_chat_template.py | 19 ++++++++++++++-----
 vllm/entrypoints/openai/serving_chat.py  | 23 +++++++++++++++--------
 2 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py
index 6972ae1dee4a1..8d6ad6706fb0e 100644
--- a/tests/async_engine/test_chat_template.py
+++ b/tests/async_engine/test_chat_template.py
@@ -76,20 +76,29 @@ def test_load_chat_template():
 {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""  # noqa: E501
 
 
-def test_no_load_chat_template():
+def test_no_load_chat_template_filelike():
     # Testing chatml template
     template = "../../examples/does_not_exist"
     tokenizer = MockTokenizer()
 
+    mock_serving_chat = MockServingChat(tokenizer)
+
+    with pytest.raises(ValueError, match="looks like a file path"):
+        OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                              chat_template=template)
+
+
+def test_no_load_chat_template_literallike():
+    # Testing chatml template
+    template = "{{ messages }}"
+    tokenizer = MockTokenizer()
+
     mock_serving_chat = MockServingChat(tokenizer)
     OpenAIServingChat._load_chat_template(mock_serving_chat,
                                           chat_template=template)
     template_content = tokenizer.chat_template
 
-    # Test assertions
-    assert template_content is not None
-    # Hard coded value for template_chatml.jinja
-    assert template_content == """../../examples/does_not_exist"""
+    assert template_content == template
 
 
 @pytest.mark.asyncio
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index d502dd0a4eb75..2ff335eb71073 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -319,23 +319,30 @@ async def chat_completion_full_generator(
         return response
 
     def _load_chat_template(self, chat_template):
+        tokenizer = self.tokenizer
+
         if chat_template is not None:
             try:
                 with open(chat_template, "r") as f:
-                    self.tokenizer.chat_template = f.read()
-            except OSError:
+                    tokenizer.chat_template = f.read()
+            except OSError as e:
+                JINJA_CHARS = "{}\n"
+                if not any(c in chat_template for c in JINJA_CHARS):
+                    msg = (f"The supplied chat template ({chat_template}) "
+                           f"looks like a file path, but it failed to be "
+                           f"opened. Reason: {e}")
+                    raise ValueError(msg) from e
+
                 # If opening a file fails, set chat template to be args to
                 # ensure we decode so our escape are interpreted correctly
-                self.tokenizer.chat_template = codecs.decode(
+                tokenizer.chat_template = codecs.decode(
                     chat_template, "unicode_escape")
 
             logger.info(
-                f"Using supplied chat template:\n{self.tokenizer.chat_template}"
-            )
-        elif self.tokenizer.chat_template is not None:
+                f"Using supplied chat template:\n{tokenizer.chat_template}")
+        elif tokenizer.chat_template is not None:
             logger.info(
-                f"Using default chat template:\n{self.tokenizer.chat_template}"
-            )
+                f"Using default chat template:\n{tokenizer.chat_template}")
         else:
             logger.warning(
                 "No chat template provided. Chat API will not work.")

From a8c5a2d9fd9ee963f610bb0752e7bc66f2fc3903 Mon Sep 17 00:00:00 2001
From: Philipp Moritz <pcmoritz@gmail.com>
Date: Tue, 23 Apr 2024 18:18:23 -0700
Subject: [PATCH 081/106] [Kernel] FP8 support for MoE kernel / Mixtral (#4244)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR is the first step towards fixing https://github.com/vllm-project/vllm/pull/3208

It implements dynamic per-tensor scaling (see https://github.com/vllm-project/vllm/pull/4118), so users do not need to compute activation scales on a calibration dataset and they also don't need to convert their model checkpoints. It is enough to specify the `quantization="fp8"` argument. You can try out the PR like this:

```python
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="mistralai/Mixtral-8x7B-Instruct-v0.1", tensor_parallel_size=2, quantization="fp8")

outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

**Performance**: For this PR, the focus is on making the code clean (while still trying to get reasonable performance), there is a bunch of optimizations that we will submit as a follow up PR that significantly improve the performance (similar to the numbers in https://github.com/vllm-project/vllm/pull/3954). With this PR, the results are as follows:

<img width="725" alt="Screenshot 2024-04-21 at 1 31 50 PM" src="https://github.com/vllm-project/vllm/assets/113316/d8fe1118-07a0-4d4e-8530-37a77d465a03">


**Accuracy**: The accuracy with this PR on MMLU on `mistralai/Mixtral-8x7B-v0.1` is as follows:

```
|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|
|------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu              |N/A    |none  |     0|acc   |0.7018|±  |0.0036|
| - humanities     |N/A    |none  |     5|acc   |0.6472|±  |0.0065|
| - other          |N/A    |none  |     5|acc   |0.7673|±  |0.0072|
| - social_sciences|N/A    |none  |     5|acc   |0.8099|±  |0.0070|
| - stem           |N/A    |none  |     5|acc   |0.6131|±  |0.0083|
```
this compares favorably with the fp16 results which are
```
|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|
|------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu              |N/A    |none  |     0|acc   |0.7020|±  |0.1313|
| - humanities     |N/A    |none  |     5|acc   |0.6425|±  |0.1349|
| - other          |N/A    |none  |     5|acc   |0.7744|±  |0.1038|
| - social_sciences|N/A    |none  |     5|acc   |0.8131|±  |0.0695|
| - stem           |N/A    |none  |     5|acc   |0.6108|±  |0.1383|
```

Happy hacking!
---
 CMakeLists.txt                                |   1 +
 csrc/ops.h                                    |   5 +
 csrc/pybind.cpp                               |   1 +
 csrc/quantization/fp8/fp8_cuda_kernels.cu     | 103 ++++++++++++
 vllm/_custom_ops.py                           |  10 +-
 ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 146 ++++++++++++++++++
 .../layers/fused_moe/fused_moe.py             |  93 ++++++++---
 vllm/model_executor/model_loader/loader.py    |   2 +
 vllm/model_executor/model_loader/utils.py     |   1 +
 vllm/model_executor/models/mixtral.py         |  44 +++++-
 10 files changed, 385 insertions(+), 21 deletions(-)
 create mode 100644 csrc/quantization/fp8/fp8_cuda_kernels.cu
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b2d0cf3e568b7..4a99985d9abc4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -167,6 +167,7 @@ set(VLLM_EXT_SRC
   "csrc/layernorm_kernels.cu"
   "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
   "csrc/quantization/gptq/q_gemm.cu"
+  "csrc/quantization/fp8/fp8_cuda_kernels.cu"
   "csrc/cuda_utils_kernels.cu"
   "csrc/moe_align_block_size_kernels.cu"
   "csrc/pybind.cpp")
diff --git a/csrc/ops.h b/csrc/ops.h
index a379c910d9cf3..ff7a3de1a0a8c 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -146,6 +146,11 @@ void gptq_shuffle(
   torch::Tensor q_perm,
   int bit);
 
+void scaled_fp8_quant(
+  torch::Tensor& out,
+  torch::Tensor& input,
+  torch::Tensor& scale);
+
 void moe_align_block_size(
   torch::Tensor topk_ids,
   int num_experts,
diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp
index 42e92e5382e8e..a5b16c5abc3ed 100644
--- a/csrc/pybind.cpp
+++ b/csrc/pybind.cpp
@@ -73,6 +73,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
   ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
   ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
+  ops.def("scaled_fp8_quant", &scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor");
   ops.def(
     "moe_align_block_size",
     &moe_align_block_size,
diff --git a/csrc/quantization/fp8/fp8_cuda_kernels.cu b/csrc/quantization/fp8/fp8_cuda_kernels.cu
new file mode 100644
index 0000000000000..c3337cede1282
--- /dev/null
+++ b/csrc/quantization/fp8/fp8_cuda_kernels.cu
@@ -0,0 +1,103 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/extension.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <cmath>
+
+#include "cuda_compat.h"
+#include "dispatch_utils.h"
+
+namespace vllm {
+
+__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
+    float old;
+    old = (value >= 0) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
+         __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
+
+    return old;
+}
+
+// Compute the absolute maximum m of the input tensor and store
+// m / float8_e4m3::max() in *scale. Each thread block performs a
+// reduction tree and the memory in scale is atomically updated.
+// So to get the right answer, *scale needs to be initialized to
+// a value <= 0.0 and we need to wait for all thread blocks to
+// finish before consuming *scale.
+template<typename scalar_t>
+__global__ void segmented_max_reduction(
+  float* __restrict__ scale,
+  const scalar_t* __restrict__ input,
+  int64_t num_elems) {
+  __shared__ float cache[1024];
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+  // First store maximum for all values processes by
+  // the current thread in cache[threadIdx.x]
+  scalar_t tmp = 0.0;
+  while (i < num_elems) {
+    float x = static_cast<float>(input[i]);
+    tmp = max(tmp, fabs(x));
+    i += blockDim.x * gridDim.x;
+  }
+  cache[threadIdx.x] = tmp;
+
+  __syncthreads();
+
+  // Now perform parallel reduction within the thread block
+  int ib = blockDim.x / 2;
+  while (ib != 0) {
+    if (threadIdx.x < ib && cache[threadIdx.x + ib] > cache[threadIdx.x]) {
+        cache[threadIdx.x] = cache[threadIdx.x + ib];
+    }
+    __syncthreads();
+    ib /= 2;
+  }
+  // Finally, since cache[0] contains the maximum for this thread block,
+  // atomically write the max to the target location
+  if (threadIdx.x == 0) {
+    atomicMaxFloat(scale, cache[0] / std::numeric_limits<c10::Float8_e4m3fn>::max());
+  }
+}
+
+template<typename scalar_t>
+__global__ void scaled_fp8_quant_kernel(
+  c10::Float8_e4m3fn* __restrict__ out,
+  const scalar_t* __restrict__ input,
+  const float* __restrict__ scale,
+  int64_t num_elems) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  while (i < num_elems) {
+    out[i] = static_cast<c10::Float8_e4m3fn>(input[i] / *scale);
+    i += blockDim.x * gridDim.x;
+  }
+}
+
+} // namespace vllm
+
+void scaled_fp8_quant(
+  torch::Tensor& out,      // [..., d]
+  torch::Tensor& input,    // [..., d]
+  torch::Tensor& scale)    // [1]
+{
+  int64_t num_tokens = input.numel() / input.size(-1);
+  int64_t num_elems = input.numel();
+  dim3 grid(num_tokens);
+  dim3 block(1024);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  VLLM_DISPATCH_FLOATING_TYPES(
+    input.scalar_type(),
+    "scaled_fp8_quant_kernel",
+    [&] {
+      vllm::segmented_max_reduction<scalar_t><<<grid, block, 0, stream>>>(
+        scale.data_ptr<float>(),
+        input.data_ptr<scalar_t>(),
+        num_elems);
+      vllm::scaled_fp8_quant_kernel<scalar_t><<<grid, block, 0, stream>>>(
+        out.data_ptr<c10::Float8_e4m3fn>(),
+        input.data_ptr<scalar_t>(),
+        scale.data_ptr<float>(),
+        num_elems);
+      });
+}
+
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index a0837a20875fe..e4b16ed918d1a 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1,4 +1,4 @@
-from typing import Dict, Optional
+from typing import Dict, Optional, Tuple
 
 import torch
 
@@ -153,6 +153,14 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
                                 size_n, size_k)
 
 
+# fp8
+def scaled_fp8_quant(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+    output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
+    vllm_ops.scaled_fp8_quant(output, input, scale)
+    return output, scale
+
+
 # moe
 def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
                          block_size: int, sorted_token_ids: torch.Tensor,
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
new file mode 100644
index 0000000000000..2ad07bf79a25c
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 377b6588dbf47..ac7c30e2a9727 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -21,6 +21,8 @@ def fused_moe_kernel(
     a_ptr,
     b_ptr,
     c_ptr,
+    a_scale_ptr,
+    b_scale_ptr,
     topk_weights_ptr,
     sorted_token_ids_ptr,
     expert_ids_ptr,
@@ -49,6 +51,7 @@ def fused_moe_kernel(
     MUL_ROUTED_WEIGHT: tl.constexpr,
     top_k: tl.constexpr,
     compute_type: tl.constexpr,
+    use_fp8: tl.constexpr,
 ):
     """
     Implements the fused computation for a Mixture of Experts (MOE) using
@@ -111,6 +114,10 @@ def fused_moe_kernel(
     b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk +
                                                 offs_bn[None, :] * stride_bn)
 
+    if use_fp8:
+        a_scale = tl.load(a_scale_ptr)
+        b_scale = tl.load(b_scale_ptr + off_experts)
+
     # -----------------------------------------------------------
     # Iterate to compute a block of the C matrix.
     # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
@@ -129,7 +136,10 @@ def fused_moe_kernel(
                     mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
                     other=0.0)
         # We accumulate along the K dimension.
-        accumulator += tl.dot(a, b)
+        if use_fp8:
+            accumulator = tl.dot(a, b, acc=accumulator)
+        else:
+            accumulator += tl.dot(a, b)
         # Advance the ptrs to the next K block.
         a_ptrs += BLOCK_SIZE_K * stride_ak
         b_ptrs += BLOCK_SIZE_K * stride_bk
@@ -140,7 +150,10 @@ def fused_moe_kernel(
                              other=0)
         accumulator = accumulator * moe_weight[:, None]
 
-    accumulator = accumulator.to(compute_type)
+    if use_fp8:
+        accumulator = (accumulator * a_scale * b_scale).to(compute_type)
+    else:
+        accumulator = accumulator.to(compute_type)
     # -----------------------------------------------------------
     # Write back the block of the output
     offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
@@ -207,15 +220,24 @@ def moe_align_block_size(
 
 
 def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
-                            topk_weights: torch.Tensor, topk_ids: torch.Tensor,
+                            B_scale: torch.Tensor, topk_weights: torch.Tensor,
+                            topk_ids: torch.Tensor,
                             sorted_token_ids: torch.Tensor,
                             expert_ids: torch.Tensor,
                             num_tokens_post_padded: torch.Tensor,
                             mul_routed_weight: bool, top_k: int,
-                            config: Dict[str, Any]) -> None:
+                            config: Dict[str, Any], compute_type: tl.dtype,
+                            use_fp8: bool) -> None:
     assert topk_weights.stride(1) == 1
     assert sorted_token_ids.stride(0) == 1
 
+    if not use_fp8:
+        A_scale = None
+        assert B_scale is None
+    else:
+        A, A_scale = ops.scaled_fp8_quant(A)
+        assert B_scale is not None
+
     grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[
         'BLOCK_SIZE_M']) * triton.cdiv(B.shape[1], META['BLOCK_SIZE_N']), )
 
@@ -223,6 +245,8 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
         A,
         B,
         C,
+        A_scale,
+        B_scale,
         topk_weights,
         sorted_token_ids,
         expert_ids,
@@ -240,18 +264,21 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
         C.stride(2),
         MUL_ROUTED_WEIGHT=mul_routed_weight,
         top_k=top_k,
-        compute_type=tl.bfloat16 if A.dtype == torch.bfloat16 else tl.float16,
+        compute_type=compute_type,
+        use_fp8=use_fp8,
         **config,
     )
 
 
-def get_config_file_name(E: int, N: int) -> str:
+def get_config_file_name(E: int, N: int, dtype: Optional[str]) -> str:
     device_name = torch.cuda.get_device_name().replace(" ", "_")
-    return f"E={E},N={N},device_name={device_name}.json"
+    dtype_selector = "" if not dtype else f",dtype={dtype}"
+    return f"E={E},N={N},device_name={device_name}{dtype_selector}.json"
 
 
 @functools.lru_cache
-def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]:
+def get_moe_configs(E: int, N: int,
+                    dtype: Optional[str]) -> Optional[Dict[int, Any]]:
     """
     Return optimized configurations for the fused MoE kernel.
 
@@ -263,7 +290,7 @@ def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]:
 
     # First look up if an optimized configuration is available in the configs
     # directory
-    json_file_name = get_config_file_name(E, N)
+    json_file_name = get_config_file_name(E, N, dtype)
 
     config_file_path = os.path.join(
         os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name)
@@ -288,6 +315,9 @@ def fused_moe(
     renormalize: bool,
     inplace: bool = False,
     override_config: Optional[Dict[str, Any]] = None,
+    use_fp8: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     This function computes a Mixture of Experts (MoE) layer using two sets of
@@ -305,6 +335,12 @@ def fused_moe(
         Defaults to False.
     - override_config (Optional[Dict[str, Any]]): Optional override
         for the kernel configuration.
+    - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner
+        products for w1 and w2. Defaults to False.
+    - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
+        w1.
+    - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
+        w2.
 
     Returns:
     - torch.Tensor: The output tensor after applying the MoE layer.
@@ -358,7 +394,8 @@ def fused_moe(
         config = override_config
     else:
         # First try to load optimal config from the file
-        configs = get_moe_configs(E, w2.shape[2])
+        configs = get_moe_configs(E, w2.shape[2],
+                                  "float8" if use_fp8 else None)
 
         if configs:
             # If an optimal configuration map has been found, look up the
@@ -394,17 +431,37 @@ def fused_moe(
     sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
         topk_ids, config['BLOCK_SIZE_M'], E)
 
-    invoke_fused_moe_kernel(hidden_states, w1, intermediate_cache1,
-                            topk_weights, topk_ids, sorted_token_ids,
-                            expert_ids, num_tokens_post_padded, False,
-                            topk_ids.shape[1], config)
+    invoke_fused_moe_kernel(hidden_states,
+                            w1,
+                            intermediate_cache1,
+                            w1_scale,
+                            topk_weights,
+                            topk_ids,
+                            sorted_token_ids,
+                            expert_ids,
+                            num_tokens_post_padded,
+                            False,
+                            topk_ids.shape[1],
+                            config,
+                            compute_type=tl.float16,
+                            use_fp8=use_fp8)
 
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N))
 
-    invoke_fused_moe_kernel(intermediate_cache2, w2, intermediate_cache3,
-                            topk_weights, topk_ids, sorted_token_ids,
-                            expert_ids, num_tokens_post_padded, True, 1,
-                            config)
+    invoke_fused_moe_kernel(intermediate_cache2,
+                            w2,
+                            intermediate_cache3,
+                            w2_scale,
+                            topk_weights,
+                            topk_ids,
+                            sorted_token_ids,
+                            expert_ids,
+                            num_tokens_post_padded,
+                            True,
+                            1,
+                            config,
+                            compute_type=tl.float16,
+                            use_fp8=use_fp8)
 
     if inplace:
         return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 64cd186506bdb..f75c35a69d4a9 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -232,6 +232,8 @@ def load_model(self, *, model_config: ModelConfig,
                 linear_method = getattr(module, "linear_method", None)
                 if linear_method is not None:
                     linear_method.process_weights_after_loading(module)
+                if hasattr(module, "process_weights_after_loading"):
+                    module.process_weights_after_loading()
         return model.eval()
 
 
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index a0a3b2784614d..f7e0f56c1a46e 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -24,6 +24,7 @@ def get_model_architecture(
     # Special handling for quantized Mixtral.
     # FIXME(woosuk): This is a temporary hack.
     if (model_config.quantization is not None
+            and model_config.quantization != "fp8"
             and "MixtralForCausalLM" in architectures):
         architectures = ["QuantMixtralForCausalLM"]
 
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 4d1755f2bbe63..a33b795d7088e 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -39,6 +39,8 @@
                                                ReplicatedLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.fp8 import (Fp8LinearMethod,
+                                                         per_tensor_quantize)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -47,6 +49,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.sequence import SamplerOutput
+from vllm.utils import print_warning_once
 
 
 class MixtralMoE(nn.Module):
@@ -66,6 +69,7 @@ def __init__(
         intermediate_size: int,
         params_dtype: Optional[torch.dtype] = None,
         tp_size: Optional[int] = None,
+        linear_method: Optional[LinearMethodBase] = None,
     ):
         super().__init__()
         self.tp_size = tp_size or get_tensor_model_parallel_world_size()
@@ -73,6 +77,9 @@ def __init__(
         self.top_k = top_k
         self.hidden_size = hidden_size
         self.intermediate_size = intermediate_size // self.tp_size
+        # FIXME(pcmoritz): Make this more general to support different
+        # quantization schemes
+        self.use_fp8 = isinstance(linear_method, Fp8LinearMethod)
 
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
@@ -97,6 +104,16 @@ def __init__(
                         device="cuda",
                         dtype=self.params_dtype))
 
+        # Scaling factors for FP8 weights
+        self.ws_scale = nn.Parameter(
+            torch.ones(
+                self.num_total_experts, device="cuda", dtype=torch.float32),
+            requires_grad=False) if self.use_fp8 else None
+        self.w2s_scale = nn.Parameter(
+            torch.ones(
+                self.num_total_experts, device="cuda", dtype=torch.float32),
+            requires_grad=False) if self.use_fp8 else None
+
         set_weight_attrs(self.ws, {
             "weight_loader": self.weight_loader,
         })
@@ -118,6 +135,18 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
         if weight_name.endswith("w2.weight"):
             param_data[expert_id, :, :] = loaded_weight[:, shard]
 
+    def process_weights_after_loading(self):
+        if self.use_fp8:
+            ws = torch.empty_like(self.ws.data, dtype=torch.float8_e4m3fn)
+            w2s = torch.empty_like(self.w2s.data, dtype=torch.float8_e4m3fn)
+            for expert in range(self.num_total_experts):
+                ws[expert, :, :], self.ws_scale[expert] = per_tensor_quantize(
+                    self.ws.data[expert, :, :])
+                w2s[expert, :, :], self.w2s_scale[
+                    expert] = per_tensor_quantize(self.w2s.data[expert, :, :])
+            self.ws = nn.Parameter(ws, requires_grad=False)
+            self.w2s = nn.Parameter(w2s, requires_grad=False)
+
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_size = hidden_states.shape
         hidden_states = hidden_states.view(-1, self.hidden_size)
@@ -129,7 +158,10 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                                         router_logits,
                                         self.top_k,
                                         renormalize=True,
-                                        inplace=True)
+                                        inplace=True,
+                                        use_fp8=self.use_fp8,
+                                        w1_scale=self.ws_scale,
+                                        w2_scale=self.w2s_scale)
 
         if self.tp_size > 1:
             final_hidden_states = tensor_model_parallel_all_reduce(
@@ -171,6 +203,13 @@ def __init__(self,
         self.rope_theta = rope_theta
         self.sliding_window = sliding_window
 
+        if isinstance(linear_method, Fp8LinearMethod):
+            print_warning_once(
+                "For Mixtral FP8 quantization, we currently do not quantize "
+                "the attention layers until their FP8 performance is improved."
+            )
+            linear_method = None
+
         self.qkv_proj = QKVParallelLinear(
             hidden_size,
             self.head_dim,
@@ -238,7 +277,8 @@ def __init__(
             num_experts=config.num_local_experts,
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
-            intermediate_size=config.intermediate_size)
+            intermediate_size=config.intermediate_size,
+            linear_method=linear_method)
         self.input_layernorm = RMSNorm(config.hidden_size,
                                        eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(config.hidden_size,

From e5b0dc8d224f880e1a08facd90d395798d7846ca Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Tue, 23 Apr 2024 21:26:33 -0400
Subject: [PATCH 082/106] [BUG] fixed fp8 conflict with aqlm (#4307)

Fixes fp8 iterface which broke in AQLM merge.
---
 .buildkite/test-pipeline.yaml                  |  3 +++
 vllm/model_executor/layers/linear.py           | 16 +++++++++++++---
 vllm/model_executor/layers/quantization/fp8.py |  3 ++-
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f7c1569696249..11cda053260ec 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -96,6 +96,9 @@ steps:
 - label: Metrics Test
   command: pytest -v -s metrics
 
+- label: Quantization Test
+  command: pytest -v -s quantization
+
 - label: Benchmarks
   working_dir: "/vllm-workspace/.buildkite"
   commands:
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 9cf8bcd825482..b46cc52df9bf9 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -36,9 +36,19 @@ def create_weights(self, layer: torch.nn.Module,
                        output_partition_sizes: List[int], input_size: int,
                        output_size: int, params_dtype: torch.dtype,
                        **extra_weight_attrs):
-        """Create weights for a linear layer.
-
-        The weights will be set as attributes of the layer."""
+        """Create weights for a linear layer. 
+           The weights will be set as attributes of the layer.
+        
+        Args:
+            layer: The layer that is using the LinearMethodBase factory.
+            input_size_per_partition: Size of the weight input dim on rank X.
+            output_partition_sizes: Sizes of the output dim of each logical 
+                weight on rank X. E.g., output_partition_sizes for QKVLinear
+                is a list contains the width of Wq, Wk, Wv on rank X.
+            input_size: Size of the input dim of the weight across all ranks.
+            output_size: Size of the output dim of the weight across all ranks.
+            params_dtype: Datatype of the parameters.
+        """
         raise NotImplementedError
 
     @abstractmethod
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 8df82e0e18edd..01e494c870e71 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -64,12 +64,13 @@ def create_weights(
         self,
         layer: torch.nn.Module,
         input_size_per_partition: int,
-        output_size_per_partition: int,
+        output_partition_sizes: List[int],
         input_size: int,
         output_size: int,
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
+        output_size_per_partition = sum(output_partition_sizes)
         weight = Parameter(torch.empty(output_size_per_partition,
                                        input_size_per_partition,
                                        dtype=params_dtype),

From 16883fd275a0e7ae7501ea1fbf0c87e4444526df Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 23 Apr 2024 18:32:19 -0700
Subject: [PATCH 083/106] [Core][Distributed] use cpu/gloo to initialize pynccl
 (#4248)

---
 tests/distributed/test_pynccl.py              |  15 ++-
 .../device_communicators/pynccl.py            | 122 ++++++++++--------
 .../device_communicators/pynccl_utils.py      |  12 +-
 vllm/distributed/parallel_state.py            |   6 +
 vllm/worker/worker.py                         |   9 +-
 5 files changed, 93 insertions(+), 71 deletions(-)

diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index d58f621d36b86..6d7d4a5806bd0 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -5,6 +5,7 @@
 
 from vllm.distributed.device_communicators.pynccl import (NCCLCommunicator,
                                                           ncclGetUniqueId)
+from vllm.distributed.parallel_state import init_distributed_environment
 from vllm.utils import update_environment_variables
 
 
@@ -26,19 +27,23 @@ def distributed_run(fn, world_size):
     for p in processes:
         p.join()
 
+    for p in processes:
+        assert p.exitcode == 0
+
 
-def update_env(fn):
+def worker_fn_wrapper(fn):
     # `multiprocessing.Process` cannot accept environment variables directly
     # so we need to pass the environment variables as arguments
     # and update the environment variables in the function
-    def wrapper(env):
+    def wrapped_fn(env):
         update_environment_variables(env)
+        init_distributed_environment()
         fn()
 
-    return wrapper
+    return wrapped_fn
 
 
-@update_env
+@worker_fn_wrapper
 def worker_fn():
     comm = NCCLCommunicator()
     tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank)
@@ -53,7 +58,7 @@ def test_pynccl():
     distributed_run(worker_fn, 2)
 
 
-@update_env
+@worker_fn_wrapper
 def worker_fn_with_cudagraph():
     with torch.no_grad():
         graph = torch.cuda.CUDAGraph()
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 0707afe922f40..fcedf0fed34cb 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -20,14 +20,15 @@
 # variable in the code.
 
 import ctypes
-import datetime
 import platform
+from typing import Optional, Union
 
 # ===================== import region =====================
 import torch
 import torch.distributed as dist
-from torch.distributed import ReduceOp
+from torch.distributed import ProcessGroup, ReduceOp
 
+from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank
 from vllm.logger import init_logger
 from vllm.utils import find_nccl_library, nccl_integrity_check
 
@@ -59,6 +60,18 @@
 
 ncclResult_t = ctypes.c_int
 
+_c_ncclGetErrorString = nccl.ncclGetErrorString
+_c_ncclGetErrorString.restype = ctypes.c_char_p
+_c_ncclGetErrorString.argtypes = [ncclResult_t]
+
+
+def NCCL_CHECK(result: ncclResult_t) -> None:
+    if result != 0:
+        error_str = _c_ncclGetErrorString(result)
+        error_str = error_str.decode("utf-8")
+        raise RuntimeError(f"NCCL error: {error_str}")
+
+
 # equivalent to c declaration:
 # ncclResult_t  ncclGetVersion(int *version);
 _c_ncclGetVersion = nccl.ncclGetVersion
@@ -68,8 +81,7 @@
 
 def ncclGetVersion() -> str:
     version = ctypes.c_int()
-    result = _c_ncclGetVersion(ctypes.byref(version))
-    assert result == 0
+    NCCL_CHECK(_c_ncclGetVersion(ctypes.byref(version)))
     # something like 21903 --> "2.19.3"
     version_str = str(version.value)
     major = version_str[0].lstrip("0")
@@ -91,8 +103,7 @@ class NcclUniqueId(ctypes.Structure):
 
 def ncclGetUniqueId() -> NcclUniqueId:
     unique_id = NcclUniqueId()
-    result = _c_ncclGetUniqueId(ctypes.byref(unique_id))
-    assert result == 0
+    NCCL_CHECK(_c_ncclGetUniqueId(ctypes.byref(unique_id)))
     return unique_id
 
 
@@ -199,66 +210,75 @@ class NCCLCommunicator:
 
     def __init__(
         self,
-        backend=None,
-        init_method=None,
-        timeout=datetime.timedelta(seconds=10),
-        world_size: int = -1,
-        rank: int = -1,
-        store=None,
-        group_name: str = "",
-        pg_options=None,
-        local_rank: int = -1,
+        group: Optional[ProcessGroup] = None,
+        device: Optional[Union[int, str, torch.device]] = None,
     ):
-        if not dist.is_initialized():
-            backend = backend or "nccl"
-            assert backend == 'nccl', (
-                "only use nccl backend for starting the NCCL communicator")
-            dist.init_process_group(backend=backend,
-                                    init_method=init_method,
-                                    timeout=timeout,
-                                    world_size=world_size,
-                                    rank=rank,
-                                    store=store,
-                                    group_name=group_name,
-                                    pg_options=pg_options)
-        self.rank = dist.get_rank()
-        self.world_size = dist.get_world_size()
-        if local_rank == -1:
-            local_rank = self.rank
-        self.local_rank = local_rank
-        # don't use these args, as they can be -1
-        # use `self.rank`, `self.local_rank` and `self.world_size` instead
-        del world_size, rank, local_rank
-        torch.cuda.set_device(self.local_rank)
+        """
+        Args:
+            group: the process group to work on. If None, it will use the
+                default process group.
+            device: the device to bind the NCCLCommunicator to. If None,
+                it will be bind to f"cuda:{local_rank}".
+        It is the caller's responsibility to make sure each communicator
+        is bind to a unique device.
+        """
+        assert dist.is_initialized()
+        group = get_cpu_world_group() if group is None else group
+        assert dist.get_backend(group) != dist.Backend.NCCL, (
+            "NCCLCommunicator should be attached to a non-NCCL group.")
+        self.group = group
+        self.rank = dist.get_rank(group)
+        self.world_size = dist.get_world_size(group)
         if self.rank == 0:
             self.unique_id = ncclGetUniqueId()
         else:
             self.unique_id = NcclUniqueId()
-        tensor = torch.ByteTensor(list(self.unique_id.internal)).cuda(
-            self.local_rank)
-        dist.broadcast(tensor, src=0)
-        byte_list = tensor.cpu().tolist()
+        tensor = torch.ByteTensor(list(self.unique_id.internal))
+        dist.broadcast(tensor, src=0, group=group)
+        byte_list = tensor.tolist()
         for i, byte in enumerate(byte_list):
             self.unique_id.internal[i] = byte
         self.comm = ctypes.c_void_p()
-        result = _c_ncclCommInitRank(ctypes.byref(self.comm), self.world_size,
-                                     self.unique_id, self.rank)
-        assert result == 0
-        self.stream = torch.cuda.Stream(device=f"cuda:{self.local_rank}")
+        if device is None:
+            local_rank = get_local_rank()
+            device = torch.device(f"cuda:{local_rank}")
+        elif isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        # now `device` is a `torch.device` object
+        assert isinstance(device, torch.device)
+        self.device = device
+        # nccl communicator and stream will use this device
+        current_device = torch.cuda.current_device()
+        try:
+            torch.cuda.set_device(device)
+            NCCL_CHECK(
+                _c_ncclCommInitRank(ctypes.byref(self.comm), self.world_size,
+                                    self.unique_id, self.rank))
+            self.stream = torch.cuda.Stream()
+        finally:
+            torch.cuda.set_device(current_device)
 
     def all_reduce(self,
                    tensor: torch.Tensor,
                    op: ReduceOp = ReduceOp.SUM,
                    stream=None):
+        # nccl communicator created on a specific device
+        # will only work on tensors on the same device
+        # otherwise it will cause "illegal memory access"
+        assert tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {tensor.device}")
         if stream is None:
             stream = self.stream
-        result = _c_ncclAllReduce(ctypes.c_void_p(tensor.data_ptr()),
-                                  ctypes.c_void_p(tensor.data_ptr()),
-                                  tensor.numel(),
-                                  ncclDataTypeEnum.from_torch(tensor.dtype),
-                                  ncclRedOpTypeEnum.from_torch(op), self.comm,
-                                  ctypes.c_void_p(stream.cuda_stream))
-        assert result == 0
+        NCCL_CHECK(
+            _c_ncclAllReduce(ctypes.c_void_p(tensor.data_ptr()),
+                             ctypes.c_void_p(tensor.data_ptr()),
+                             tensor.numel(),
+                             ncclDataTypeEnum.from_torch(tensor.dtype),
+                             ncclRedOpTypeEnum.from_torch(op), self.comm,
+                             ctypes.c_void_p(stream.cuda_stream)))
 
     def __del__(self):
         # `dist` module might have been already destroyed
diff --git a/vllm/distributed/device_communicators/pynccl_utils.py b/vllm/distributed/device_communicators/pynccl_utils.py
index 916dc814af7eb..a717fddb695ba 100644
--- a/vllm/distributed/device_communicators/pynccl_utils.py
+++ b/vllm/distributed/device_communicators/pynccl_utils.py
@@ -2,7 +2,7 @@
 from typing import Optional
 
 import torch
-from torch.distributed import ReduceOp
+from torch.distributed import ProcessGroup, ReduceOp
 
 from vllm.logger import init_logger
 
@@ -37,17 +37,11 @@ def set_pynccl_stream(stream: torch.cuda.Stream):
         pass
 
 
-def init_process_group(world_size: int,
-                       rank: int,
-                       init_method: str,
-                       local_rank: int = -1) -> None:
+def init_process_group(group: Optional[ProcessGroup] = None) -> None:
     assert not is_initialized()
     global comm
     logger.info(f"vLLM is using nccl=={ncclGetVersion()}")
-    comm = NCCLCommunicator(init_method=init_method,
-                            world_size=world_size,
-                            local_rank=local_rank,
-                            rank=rank)
+    comm = NCCLCommunicator(group=group)
 
 
 def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None:
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index e2473736375e0..515f2212511b7 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -4,6 +4,7 @@
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 """Tensor and pipeline parallel groups."""
 import contextlib
+import os
 from typing import Optional
 
 import torch
@@ -73,6 +74,11 @@ def init_distributed_environment(
         ranks = list(range(torch.distributed.get_world_size()))
         _CPU_WORLD_GROUP = torch.distributed.new_group(ranks=ranks,
                                                        backend="gloo")
+        # set the local rank
+        # local_rank is not available in torch ProcessGroup,
+        # see https://github.com/pytorch/pytorch/issues/122816
+        if local_rank == -1 and distributed_init_method == "env://":
+            local_rank = int(os.environ['LOCAL_RANK'])
         global _LOCAL_RANK
         _LOCAL_RANK = local_rank
 
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 2203570b37ad6..39ad428f16fe3 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -298,12 +298,9 @@ def init_worker_distributed_environment(
     elif parallel_config.world_size > 1:
         # NOTE(woosuk): We don't initialize pynccl process group when world size
         # is 1.
-        pynccl_utils.init_process_group(
-            world_size=parallel_config.world_size,
-            local_rank=local_rank,
-            rank=rank,
-            init_method=distributed_init_method,
-        )
+        # NOTE(kaichao): By default, pynccl will use information inside
+        # `parallel_state` for initialization.
+        pynccl_utils.init_process_group()
 
     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
                                       parallel_config.pipeline_parallel_size)

From ffc35938096951490be46b9bcded05a2dda00209 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 23 Apr 2024 18:33:12 -0700
Subject: [PATCH 084/106] [CI][Build] change pynvml to nvidia-ml-py (#4302)

---
 requirements-cuda.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index c6d2cd46aee54..1bddae4c6f40f 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -3,7 +3,7 @@
 
 # Dependencies for NVIDIA GPUs
 ray >= 2.9
-pynvml == 11.5.0
+nvidia-ml-py # for pynvml package
 vllm-nccl-cu12>=2.18,<2.19  # for downloading nccl library
 torch == 2.2.1
 xformers == 0.0.25  # Requires PyTorch 2.2.1

From cc2c2f2faeefa344c47c08f66926ed13e4984e06 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Tue, 23 Apr 2024 18:54:33 -0700
Subject: [PATCH 085/106] [Misc] Reduce supported Punica dtypes (#4304)

---
 CMakeLists.txt                          | 12 --------
 csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu |  4 ---
 csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu |  4 ---
 csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu |  4 ---
 csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu |  4 ---
 csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu |  4 ---
 csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu |  4 ---
 csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu |  4 ---
 csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu |  4 ---
 csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu |  4 ---
 csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu |  4 ---
 csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu |  4 ---
 csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu |  4 ---
 csrc/punica/bgmv/generator.py           | 20 ++++++++++++
 csrc/punica/punica_ops.cc               | 17 ++++++++++
 tests/lora/test_layers.py               | 41 +++++++++++++++++--------
 16 files changed, 66 insertions(+), 72 deletions(-)
 delete mode 100644 csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu
 delete mode 100644 csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu
 delete mode 100644 csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu
 delete mode 100644 csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu
 delete mode 100644 csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu
 delete mode 100644 csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu
 delete mode 100644 csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu
 delete mode 100644 csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu
 delete mode 100644 csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu
 delete mode 100644 csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu
 delete mode 100644 csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu
 delete mode 100644 csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4a99985d9abc4..e9262b57d0867 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -212,23 +212,11 @@ define_gpu_extension_target(
 
 set(VLLM_PUNICA_EXT_SRC
   "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
-  "csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu"
-  "csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu"
-  "csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu"
   "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
-  "csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu"
-  "csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu"
-  "csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu"
-  "csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu"
   "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
-  "csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu"
   "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
   "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
-  "csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu"
-  "csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu"
   "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
-  "csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu"
-  "csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu"
   "csrc/punica/punica_ops.cc")
 
 #
diff --git a/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu b/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu
deleted file mode 100644
index e8202dff561d9..0000000000000
--- a/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_half)
diff --git a/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu b/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu
deleted file mode 100644
index 3e7cf31dead0f..0000000000000
--- a/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_bfloat16)
diff --git a/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu b/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu
deleted file mode 100644
index 68277fa6b7d56..0000000000000
--- a/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_half)
diff --git a/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu b/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu
deleted file mode 100644
index 3b7531b8fbcfc..0000000000000
--- a/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_half)
diff --git a/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu b/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu
deleted file mode 100644
index b3b74aa3ec904..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_bfloat16)
diff --git a/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu b/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu
deleted file mode 100644
index 3cc87f5df76a1..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_half)
diff --git a/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu b/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu
deleted file mode 100644
index 9eda98bd8ddcf..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_bfloat16)
diff --git a/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu b/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu
deleted file mode 100644
index 060f9ebb8c2b1..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_bfloat16)
diff --git a/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu b/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu
deleted file mode 100644
index b37e44570bf40..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_half)
diff --git a/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu b/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu
deleted file mode 100644
index 06718cbb0a3e9..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_bfloat16)
diff --git a/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu b/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu
deleted file mode 100644
index 41fb0e45ef4e6..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_bfloat16)
diff --git a/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu b/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu
deleted file mode 100644
index 50b7ead9fcefd..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_half)
diff --git a/csrc/punica/bgmv/generator.py b/csrc/punica/bgmv/generator.py
index c347d4f2ab9f4..9bf7f6358880f 100644
--- a/csrc/punica/bgmv/generator.py
+++ b/csrc/punica/bgmv/generator.py
@@ -18,6 +18,26 @@
             if weight_dtype == "fp32":
                 # FP32 weights are not supported.
                 continue
+            if output_dtype == "fp32":
+                # LoRA A matrix.
+                if input_dtype != weight_dtype:
+                    # NOTE(woosuk): While Punica supports the case where the
+                    # input and weight dtypes are different, we only generate
+                    # the kernels the same dtypes to reduce the binary size.
+                    continue
+            elif input_dtype == "fp32":
+                # LoRA B matrix.
+                if output_dtype != weight_dtype:
+                    # NOTE(woosuk): While Punica supports the case where the
+                    # output and weight dtypes are different, we only generate
+                    # the kernels the same dtypes to reduce the binary size.
+                    continue
+            elif not (input_dtype == output_dtype == weight_dtype):
+                # NOTE(woosuk): While Punica supports mixed data types for
+                # input, output, and weight, we only generate the kernels with
+                # the same data types to reduce the binary size.
+                continue
+
             kernel_definition = TEMPLATE.format(
                 input_dtype=DTYPE_MAP[input_dtype],
                 output_dtype=DTYPE_MAP[output_dtype],
diff --git a/csrc/punica/punica_ops.cc b/csrc/punica/punica_ops.cc
index 7ebfd851c4feb..a1eaa90e85f27 100644
--- a/csrc/punica/punica_ops.cc
+++ b/csrc/punica/punica_ops.cc
@@ -50,6 +50,23 @@ inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W,
                                int64_t y_offset, int64_t full_y_size,
                                int64_t batch_size, int64_t num_layers,
                                int64_t layer_idx, float scale) {
+  // NOTE(woosuk): While Punica supports various combinations of input/output
+  // data types, we limit the supported data types to reduce the binary size.
+  constexpr bool is_input_float = std::is_same<in_T, float>::value;
+  constexpr bool is_output_float = std::is_same<out_T, float>::value;
+  if (is_input_float) {
+    if (!std::is_same<out_T, W_T>::value) {
+      return false;
+    }
+  } else if (is_output_float) {
+    if (!std::is_same<in_T, W_T>::value) {
+      return false;
+    }
+  } else if (!(std::is_same<in_T, W_T>::value &&
+               std::is_same<out_T, W_T>::value)) {
+    return false;
+  }
+
   switch (pack_u32(in_features, out_features)) {
 #define CASE_ONESIDE(_in_T, _out_T, _W_T, feat_in, feat_out)                   \
   case pack_u32(feat_in, feat_out):                                            \
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 3fed7d2a0826f..522a91dac57c8 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -422,7 +422,9 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
 
     def _pretest():
         linear = ParallelLMHead(vocab_size + lora_config.lora_extra_vocab_size,
-                                1024, vocab_size)
+                                1024,
+                                vocab_size,
+                                params_dtype=torch.float16)
         linear.weight.data = torch.rand_like(linear.weight.data)
         linear.weight.data[:, vocab_size:] = 0
         logits_processor = LogitsProcessor(
@@ -454,7 +456,7 @@ def _pretest():
             num_inputs=8 * num_loras,  # * 3,
             input_size=(1, 1024),
             input_range=(0, 1),
-            input_type=torch.float32,
+            input_type=torch.float16,
         )
         lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
 
@@ -503,7 +505,7 @@ def _pretest():
             num_inputs=8 * num_loras * 3,
             input_size=(1, 1024),
             input_range=(0, 1),
-            input_type=torch.float32,
+            input_type=torch.float16,
         )
         lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
 
@@ -545,11 +547,17 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
 
     def create_random_linear_parallel_layer():
         if orientation == "row":
-            linear = RowParallelLinear(4096, 4096, bias=False)
+            linear = RowParallelLinear(4096,
+                                       4096,
+                                       bias=False,
+                                       params_dtype=torch.float16)
             linear.weight.data = torch.rand_like(linear.weight.data)
             lora_linear = RowParallelLinearWithLoRA(linear)
         else:
-            linear = ColumnParallelLinear(4096, 4096, bias=False)
+            linear = ColumnParallelLinear(4096,
+                                          4096,
+                                          bias=False,
+                                          params_dtype=torch.float16)
             linear.weight.data = torch.rand_like(linear.weight.data)
             lora_linear = ColumnParallelLinearWithLoRA(linear)
         lora_linear.create_lora_weights(max_loras, lora_config)
@@ -573,7 +581,7 @@ def create_random_linear_parallel_layer():
             num_inputs=32 * num_loras,
             input_size=(1, 4096),
             input_range=(0, 1),
-            input_type=torch.float32,
+            input_type=torch.float16,
         )
         lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
 
@@ -612,7 +620,7 @@ def create_random_linear_parallel_layer():
             num_inputs=32 * num_loras,
             input_size=(1, 4096),
             input_range=(0, 1),
-            input_type=torch.float32,
+            input_type=torch.float16,
         )
         lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
 
@@ -648,15 +656,24 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
     def create_column_parallel_packed_layer():
         if repeats == 2:
             linear = MergedColumnParallelLinear(4096, [4096] * repeats,
-                                                bias=False)
+                                                bias=False,
+                                                params_dtype=torch.float16)
             linear.weight.data = torch.rand_like(linear.weight.data)
             lora_linear = MergedColumnParallelLinearWithLoRA(linear)
         elif repeats == 3:
-            linear = QKVParallelLinear(4096, 64, 32, bias=False)
+            linear = QKVParallelLinear(4096,
+                                       64,
+                                       32,
+                                       bias=False,
+                                       params_dtype=torch.float16)
             linear.weight.data = torch.rand_like(linear.weight.data)
             lora_linear = MergedQKVParallelLinearWithLora(linear)
         else:
-            linear = QKVParallelLinear(4096, 64, 32, bias=False)
+            linear = QKVParallelLinear(4096,
+                                       64,
+                                       32,
+                                       bias=False,
+                                       params_dtype=torch.float16)
             linear.weight.data = torch.rand_like(linear.weight.data)
             lora_linear = QKVParallelLinearWithLora(linear)
 
@@ -691,7 +708,7 @@ class FakeConfig:
             num_inputs=32 * num_loras,
             input_size=(1, 4096),
             input_range=(0, 1),
-            input_type=torch.float32,
+            input_type=torch.float16,
         )
         lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
 
@@ -731,7 +748,7 @@ class FakeConfig:
             num_inputs=32 * num_loras,
             input_size=(1, 4096),
             input_range=(0, 1),
-            input_type=torch.float32,
+            input_type=torch.float16,
         )
         lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
 

From afd3970ce5ab067c357dcf24b6a1b9022961d97b Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 24 Apr 2024 09:00:20 -0700
Subject: [PATCH 086/106] [Core][Distributed] use existing torch.cuda.device
 (#4318)

[Core][Distributed] use existing torch.cuda.device context manager (#4318)
---
 vllm/distributed/device_communicators/pynccl.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index fcedf0fed34cb..e922beba44bfa 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -250,15 +250,13 @@ def __init__(
         assert isinstance(device, torch.device)
         self.device = device
         # nccl communicator and stream will use this device
-        current_device = torch.cuda.current_device()
-        try:
-            torch.cuda.set_device(device)
+        # `torch.cuda.device` is a context manager that changes the
+        # current cuda device to the specified one
+        with torch.cuda.device(device):
             NCCL_CHECK(
                 _c_ncclCommInitRank(ctypes.byref(self.comm), self.world_size,
                                     self.unique_id, self.rank))
             self.stream = torch.cuda.Stream()
-        finally:
-            torch.cuda.set_device(current_device)
 
     def all_reduce(self,
                    tensor: torch.Tensor,

From 27ced33ff699e3581a445911d8fbc28f87b32263 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Wed, 24 Apr 2024 09:49:13 -0700
Subject: [PATCH 087/106] [Misc] Update ShareGPT Dataset Sampling in Serving
 Benchmark (#4279)

---
 benchmarks/benchmark_serving.py | 50 ++++++++++++++++++---------------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index a1810bed0b81b..3df71ffa5662d 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -29,7 +29,7 @@
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
-from typing import AsyncGenerator, List, Tuple
+from typing import AsyncGenerator, List, Optional, Tuple
 
 import numpy as np
 from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@@ -60,7 +60,11 @@ def sample_sharegpt_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, int, int]]:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
     # Load the dataset.
     with open(dataset_path) as f:
         dataset = json.load(f)
@@ -70,38 +74,32 @@ def sample_sharegpt_requests(
     dataset = [(data["conversations"][0]["value"],
                 data["conversations"][1]["value"]) for data in dataset]
 
-    # some of these will be filtered out, so sample more than we need
-    sampled_indices = random.sample(range(len(dataset)),
-                                    int(num_requests * 1.2))
-    dataset = [dataset[i] for i in sampled_indices]
-
-    # Tokenize the prompts and completions.
-    prompts = [prompt for prompt, _ in dataset]
-    prompt_token_ids = tokenizer(prompts).input_ids
-    completions = [completion for _, completion in dataset]
-    completion_token_ids = tokenizer(completions).input_ids
-    tokenized_dataset = []
-    for i in range(len(dataset)):
-        output_len = len(completion_token_ids[i])
-        tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
+    # Shuffle the dataset.
+    random.shuffle(dataset)
 
-    # Filter out too long sequences.
+    # Filter out sequences that are too long or too short
     filtered_dataset: List[Tuple[str, int, int]] = []
-    for prompt, prompt_token_ids, output_len in tokenized_dataset:
+    for i in range(len(dataset)):
+        if len(filtered_dataset) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = dataset[i][0]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        completion = dataset[i][1]
+        completion_token_ids = tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
         if prompt_len < 4 or output_len < 4:
             # Prune too short sequences.
-            # This is because TGI causes errors when the input or output length
-            # is too short.
             continue
         if prompt_len > 1024 or prompt_len + output_len > 2048:
             # Prune too long sequences.
             continue
         filtered_dataset.append((prompt, prompt_len, output_len))
 
-    # Sample the requests.
-    sampled_requests = random.sample(filtered_dataset, num_requests)
-    return sampled_requests
+    return filtered_dataset
 
 
 def sample_sonnet_requests(
@@ -363,6 +361,7 @@ def main(args: argparse.Namespace):
             dataset_path=args.dataset,
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
         )
 
     elif args.dataset_name == "sharegpt":
@@ -370,6 +369,7 @@ def main(args: argparse.Namespace):
             dataset_path=args.dataset_path,
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
         )
 
     elif args.dataset_name == "sonnet":
@@ -526,6 +526,12 @@ def main(args: argparse.Namespace):
         default=1000,
         help="Number of prompts to process.",
     )
+    parser.add_argument(
+        "--sharegpt-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output length "
+        "from the ShareGPT dataset.")
     parser.add_argument(
         "--sonnet-input-len",
         type=int,

From a110633be55e76bee3c7b87b09178f45ecfb538d Mon Sep 17 00:00:00 2001
From: alexm-nm <59768536+alexm-nm@users.noreply.github.com>
Date: Wed, 24 Apr 2024 13:35:01 -0400
Subject: [PATCH 088/106] [Bugfix] Fix marlin kernel crash on H100 (#4218)

This PR addresses the Marlin kernel H100 crash that was reported here: neuralmagic#187.
The reason for the crash was the inline PTX assembly that introduced the async_copy with streaming behavior. The solution is to use the more standard PTX for async_copy (without the fractional L2 policy for "evict_first"). There is no performance difference between standard async_copy PTX and the previous one.
---
 .../quantization/marlin/marlin_cuda_kernel.cu | 23 +++++++------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/csrc/quantization/marlin/marlin_cuda_kernel.cu b/csrc/quantization/marlin/marlin_cuda_kernel.cu
index cf1b0afdec8b4..002a70001885d 100644
--- a/csrc/quantization/marlin/marlin_cuda_kernel.cu
+++ b/csrc/quantization/marlin/marlin_cuda_kernel.cu
@@ -67,20 +67,13 @@ __device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr,
                "r"(smem), "l"(glob_ptr), "n"(BYTES));
 }
 
-// Asynchronous global->shared copy with a cache hint indicating that the values
-// may be evicted immediately; used for quantized weights B, which are only
-// accessed precisely once and should thus not pollute the L2 cache which we
-// need for inputs A and outputs C.
-__device__ inline void cp_async4_stream(void *smem_ptr, const void *glob_ptr) {
+// Asynchronous global->shared copy
+__device__ inline void cp_async4(void *smem_ptr, const void *glob_ptr) {
   const int BYTES = 16;
   uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
-  asm volatile(
-      "{\n"
-      "   .reg .b64 p;\n"
-      "   createpolicy.fractional.L2::evict_first.b64 p, 1.0;"
-      "   cp.async.cg.shared.global.L2::cache_hint [%0], [%1], %2, p;\n"
-      "}\n" ::"r"(smem),
-      "l"(glob_ptr), "n"(BYTES));
+  asm volatile("{\n"
+               "   cp.async.cg.shared.global [%0], [%1], %2;\n"
+               "}\n" :: "r"(smem), "l"(glob_ptr), "n"(BYTES));
 }
 
 // Async copy fence.
@@ -448,14 +441,14 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
       int4 *sh_b_stage = sh_b + b_sh_stage * pipe;
 #pragma unroll
       for (int i = 0; i < b_sh_wr_iters; i++) {
-        cp_async4_stream(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
+        cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
         B_ptr[i] += b_gl_rd_delta_o;
       }
       // Only fetch scales if this tile starts a new group
       if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0) {
         int4 *sh_s_stage = sh_s + s_sh_stage * pipe;
         if (s_sh_wr_pred)
-          cp_async4_stream(&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
+          cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
         s_gl_rd += s_gl_rd_delta;
       }
     }
@@ -750,7 +743,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
       // write-out
       if (group_blocks == -1 && last) {
         if (s_sh_wr_pred)
-          cp_async4_stream(&sh_s[s_sh_wr], &s[s_gl_rd]);
+          cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]);
         cp_async_fence();
       }
       thread_block_reduce();

From 50f4e481dfe700a6023e89dc4e4ad8c53de7b89b Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 24 Apr 2024 14:09:44 -0700
Subject: [PATCH 089/106] [Doc] Add note for docker user (#4340)

Co-authored-by: Simon Mo <simon.mo@hey.com>
---
 docs/source/serving/deploying_with_docker.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index 7ec769630300d..cfc462ff33b90 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -49,3 +49,6 @@ To run vLLM:
         --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
         vllm/vllm-openai <args...>
 
+.. note::
+
+        vLLM docker image is currently designed to be run under the root user (contribution welcomed for changing this!). It will try to load library at runtime under the root user's home directory, e.g. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` . If you are running the container under a different user, you may need to change the permissions of the library (and all the parent directories) to allow the user to access it. Then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` .

From 382fb33058a65f8921215bb2d3470c70e42b7857 Mon Sep 17 00:00:00 2001
From: zifeitong <zifeitong@gmail.com>
Date: Wed, 24 Apr 2024 14:10:24 -0700
Subject: [PATCH 090/106] [Misc] Use public API in benchmark_throughput (#4300)

---
 benchmarks/benchmark_throughput.py | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 6bb889d1eceba..695d06e7b243d 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -103,25 +103,22 @@ def run_vllm(
     )
 
     # Add the requests to the engine.
+    prompts = []
+    sampling_params = []
     for prompt, _, output_len in requests:
-        sampling_params = SamplingParams(
-            n=n,
-            temperature=0.0 if use_beam_search else 1.0,
-            top_p=1.0,
-            use_beam_search=use_beam_search,
-            ignore_eos=True,
-            max_tokens=output_len,
-        )
-        # FIXME(woosuk): Do not use internal method.
-        llm._add_request(
-            prompt=prompt,
-            prompt_token_ids=None,
-            sampling_params=sampling_params,
-        )
+        prompts.append(prompt)
+        sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=0.0 if use_beam_search else 1.0,
+                top_p=1.0,
+                use_beam_search=use_beam_search,
+                ignore_eos=True,
+                max_tokens=output_len,
+            ))
 
     start = time.perf_counter()
-    # FIXME(woosuk): Do not use internal method.
-    llm._run_engine(use_tqdm=True)
+    llm.generate(prompts, sampling_params, use_tqdm=True)
     end = time.perf_counter()
     return end - start
 

From e207f23904b2ddc0fe77eb416c010cf11dbc974c Mon Sep 17 00:00:00 2001
From: Caio Mendes <caioctmendes@gmail.com>
Date: Thu, 25 Apr 2024 00:06:57 -0300
Subject: [PATCH 091/106] [Model] Adds Phi-3 support (#4298)

---
 docs/source/models/supported_models.rst       |   4 +
 vllm/config.py                                |   2 +-
 .../model_executor/layers/rotary_embedding.py | 136 +++++++++++++++++-
 vllm/model_executor/models/__init__.py        |   1 +
 vllm/model_executor/models/llama.py           |  14 +-
 5 files changed, 148 insertions(+), 9 deletions(-)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 951fc3aac0c75..f4dd5d52ad873 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -115,6 +115,10 @@ Alongside each architecture, we include some popular models that use it.
     - Phi
     - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc.
     -
+  * - :code:`Phi3ForCausalLM`
+    - Phi-3
+    - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, etc.
+    -
   * - :code:`QWenLMHeadModel`
     - Qwen
     - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
diff --git a/vllm/config.py b/vllm/config.py
index 8ccc000f53a25..7f1bb70274e3d 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1096,7 +1096,7 @@ def _get_and_verify_max_len(
         derived_max_model_len = default_max_len
 
     rope_scaling = getattr(hf_config, "rope_scaling", None)
-    if rope_scaling is not None:
+    if rope_scaling is not None and rope_scaling["type"] != "su":
         assert "factor" in rope_scaling
         scaling_factor = rope_scaling["factor"]
         if rope_scaling["type"] == "yarn":
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index a5225148d7828..b8361af61ae3f 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -338,6 +338,114 @@ def _compute_cos_sin_cache(self) -> torch.Tensor:
         return cache
 
 
+class Phi3SuScaledRotaryEmbedding(nn.Module):
+    """Phi3 family of models scaled rotary embedding.
+
+    Based on the original RotaryEmbedding implementation.
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        original_max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        short_factor: List[float],
+        long_factor: List[float],
+        short_mscale: float = 1.1,
+        long_mscale: float = 1.225,
+    ):
+        super().__init__()
+
+        if rotary_dim != head_size:
+            raise ValueError(
+                f"`Phi3SuScaledRotaryEmbedding` does not support rotary_dim != \
+                    head_size ({rotary_dim}!={head_size}).")
+        if is_neox_style is False:
+            raise ValueError(
+                "`Phi3SuScaledRotaryEmbedding` only supports neox_style.")
+
+        self.head_size = head_size
+        self.max_position_embeddings = max_position_embeddings
+        self.original_max_position_embeddings = original_max_position_embeddings
+        self.base = base
+        self.short_factor = short_factor
+        self.long_factor = long_factor
+        self.short_mscale = short_mscale
+        self.long_mscale = long_mscale
+
+        short_cache = self._compute_cos_sin_cache(
+            original_max_position_embeddings, short_factor, short_mscale)
+        short_cache = short_cache.to(torch.get_default_dtype())
+        self.register_buffer("short_cos_sin_cache",
+                             short_cache,
+                             persistent=False)
+
+        long_cache = self._compute_cos_sin_cache(max_position_embeddings,
+                                                 long_factor, long_mscale)
+        long_cache = long_cache.to(torch.get_default_dtype())
+        self.register_buffer("long_cos_sin_cache",
+                             long_cache,
+                             persistent=False)
+
+        long_short_cache = torch.cat(
+            [self.short_cos_sin_cache, self.long_cos_sin_cache], dim=0)
+        self.register_buffer("long_short_cos_sin_cache",
+                             long_short_cache,
+                             persistent=False)
+
+    def _compute_inv_freq(self, rescale_factors: List[float]) -> torch.Tensor:
+        rescale_factors = torch.tensor(rescale_factors, dtype=torch.float32)
+        inv_freq = 1.0 / (rescale_factors * (self.base**(torch.arange(
+            0, self.head_size, 2, dtype=torch.float) / self.head_size)))
+        return inv_freq
+
+    def _compute_cos_sin_cache(
+        self,
+        max_position_embeddings: int,
+        rescale_factors: List[float],
+        mscale: float,
+    ) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(rescale_factors)
+        t = torch.arange(max_position_embeddings, dtype=torch.float)
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos() * mscale
+        sin = freqs.sin() * mscale
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        query = query.view(*query.shape[:-1], -1, self.head_size)
+        key = key.view(*key.shape[:-1], -1, self.head_size)
+
+        k = self.original_max_position_embeddings
+        long_prompt_offset = (torch.any(positions > k).float() *
+                              torch.full_like(positions, k)).long()
+        idx = (torch.add(positions, long_prompt_offset)
+               if long_prompt_offset is not None else positions)
+        self.long_short_cos_sin_cache = self.long_short_cos_sin_cache.to(
+            idx.device)
+        idx = torch.add(idx, offsets) if offsets is not None else idx
+        cos_sin = torch.index_select(self.long_short_cos_sin_cache, 0, idx)
+
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        cos = cos.repeat(1, 2).unsqueeze(-2)
+        sin = sin.repeat(1, 2).unsqueeze(-2)
+
+        query = query * cos + _rotate_neox(query) * sin
+        key = key * cos + _rotate_neox(key) * sin
+
+        return query.flatten(-2), key.flatten(-2)
+
+
 _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {}
 
 
@@ -349,17 +457,26 @@ def get_rope(
     is_neox_style: bool = True,
     rope_scaling: Optional[Dict[str, Any]] = None,
 ) -> RotaryEmbedding:
+    if rope_scaling is not None:
+        # Transforms every value that is a list into a tuple for caching calls
+        rope_scaling_tuple = {
+            k: tuple(v) if isinstance(v, list) else v
+            for k, v in rope_scaling.items()
+        }
+        rope_scaling_args = tuple(rope_scaling_tuple.items())
+    else:
+        rope_scaling_args = None
     key = (head_size, rotary_dim, max_position, base, is_neox_style,
-           tuple(rope_scaling.items()) if rope_scaling is not None else None)
+           rope_scaling_args)
     if key in _ROPE_DICT:
         return _ROPE_DICT[key]
-
     if rope_scaling is None:
         rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base,
                                      is_neox_style)
     else:
         scaling_type = rope_scaling["type"]
-        scaling_factor = rope_scaling["factor"]
+        if scaling_type != "su":
+            scaling_factor = rope_scaling["factor"]
         if scaling_type == "linear":
             rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim,
                                                       max_position, base,
@@ -383,6 +500,19 @@ def get_rope(
                                                     base, is_neox_style,
                                                     scaling_factor,
                                                     **extra_kwargs)
+        elif scaling_type == "su":
+            short_factor = rope_scaling["short_factor"]
+            long_factor = rope_scaling["long_factor"]
+            original_max_position = rope_scaling[
+                "original_max_position_embeddings"]
+            extra_kwargs = {
+                k: v
+                for k, v in rope_scaling.items()
+                if k in ("short_mscale", "long_mscale")
+            }
+            rotary_emb = Phi3SuScaledRotaryEmbedding(
+                head_size, rotary_dim, max_position, original_max_position,
+                base, is_neox_style, short_factor, long_factor, **extra_kwargs)
         else:
             raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
     _ROPE_DICT[key] = rotary_emb
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 17fc970568042..c0aeab5dd3032 100755
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -46,6 +46,7 @@
     "OPTForCausalLM": ("opt", "OPTForCausalLM"),
     "OrionForCausalLM": ("orion", "OrionForCausalLM"),
     "PhiForCausalLM": ("phi", "PhiForCausalLM"),
+    "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
     "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
     "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
     "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 016e3b039d1e8..c102b40045c92 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -180,6 +180,10 @@ def __init__(
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+                config, "original_max_position_embeddings", None):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings)
         max_position_embeddings = getattr(config, "max_position_embeddings",
                                           8192)
         sliding_window = getattr(config, "sliding_window", None)
@@ -378,11 +382,11 @@ def sample(
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:

From b290035094758045ceaf253b79c5a739fedd0ade Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Wed, 24 Apr 2024 23:52:22 -0700
Subject: [PATCH 092/106] [Core] Move ray_utils.py from `engine` to `executor`
 package (#4347)

---
 vllm/__init__.py                                       |  2 +-
 vllm/engine/async_llm_engine.py                        |  2 +-
 vllm/engine/llm_engine.py                              |  2 +-
 vllm/executor/ray_gpu_executor.py                      | 10 ++++++----
 vllm/{engine => executor}/ray_utils.py                 |  0
 vllm/transformers_utils/tokenizer_group/__init__.py    |  2 +-
 .../tokenizer_group/ray_tokenizer_group.py             |  2 +-
 7 files changed, 11 insertions(+), 9 deletions(-)
 rename vllm/{engine => executor}/ray_utils.py (100%)

diff --git a/vllm/__init__.py b/vllm/__init__.py
index 3585c0944f7e3..56cf5c01644fc 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -3,8 +3,8 @@
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.llm_engine import LLMEngine
-from vllm.engine.ray_utils import initialize_ray_cluster
 from vllm.entrypoints.llm import LLM
+from vllm.executor.ray_utils import initialize_ray_cluster
 from vllm.model_executor.models import ModelRegistry
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.sampling_params import SamplingParams
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 3a2f7db679358..4b007d71e9cfc 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -10,7 +10,7 @@
 from vllm.config import ModelConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.llm_engine import LLMEngine
-from vllm.engine.ray_utils import initialize_ray_cluster, ray
+from vllm.executor.ray_utils import initialize_ray_cluster, ray
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 703cba8f88aa2..f04462db54ef2 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -15,8 +15,8 @@
     SequenceGroupOutputProcessor)
 from vllm.engine.output_processor.stop_checker import StopChecker
 from vllm.engine.output_processor.util import create_output_by_sequence_group
-from vllm.engine.ray_utils import initialize_ray_cluster
 from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.ray_utils import initialize_ray_cluster
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index e69f104e7d5a4..14b3f803782c6 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -5,8 +5,8 @@
 from itertools import islice, repeat
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple
 
-from vllm.engine.ray_utils import RayWorkerWrapper, ray
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+from vllm.executor.ray_utils import RayWorkerWrapper, ray
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
@@ -74,7 +74,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
 
         # The driver dummy worker does not actually use any resources.
         # It holds the resource for the driver worker.
-        self.driver_dummy_worker: RayWorkerWrapper = None
+        self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
         # The remaining workers are the actual ray actors.
         self.workers: List[RayWorkerWrapper] = []
 
@@ -318,6 +318,7 @@ def _run_workers(
             driver_worker_output = self.driver_worker.execute_method(
                 method, *driver_args, **driver_kwargs)
         else:
+            assert self.driver_dummy_worker is not None
             driver_worker_output = ray.get(
                 self.driver_dummy_worker.execute_method.remote(
                     method, *driver_args, **driver_kwargs))
@@ -353,8 +354,9 @@ def _compiled_ray_dag(self):
         # a dummy value for now. It will be fixed soon.
         with InputNode() as input_data:
             forward_dag = MultiOutputNode([
-                worker.execute_model_compiled_dag_remote.bind(input_data)
-                for worker in self.workers
+                worker.execute_model_compiled_dag_remote.
+                bind(  # type: ignore[attr-defined]
+                    input_data) for worker in self.workers
             ])
         return forward_dag.experimental_compile()
 
diff --git a/vllm/engine/ray_utils.py b/vllm/executor/ray_utils.py
similarity index 100%
rename from vllm/engine/ray_utils.py
rename to vllm/executor/ray_utils.py
diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py
index 69380d67f9b94..0195c40c27f60 100644
--- a/vllm/transformers_utils/tokenizer_group/__init__.py
+++ b/vllm/transformers_utils/tokenizer_group/__init__.py
@@ -1,7 +1,7 @@
 from typing import Optional
 
 from vllm.config import TokenizerPoolConfig
-from vllm.engine.ray_utils import ray
+from vllm.executor.ray_utils import ray
 from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
     BaseTokenizerGroup)
 from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
index f3cdc00564dbb..7c605416854b8 100644
--- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
@@ -6,7 +6,7 @@
 from transformers import PreTrainedTokenizer
 
 from vllm.config import TokenizerPoolConfig
-from vllm.engine.ray_utils import ray
+from vllm.executor.ray_utils import ray
 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
     BaseTokenizerGroup)

From bd92e7627a37ab82970a7f4abe42523fa7f57b90 Mon Sep 17 00:00:00 2001
From: Isotr0py <41363108+Isotr0py@users.noreply.github.com>
Date: Fri, 26 Apr 2024 00:35:56 +0800
Subject: [PATCH 093/106] [Bugfix][Model] Refactor OLMo model to support new HF
 format in transformers 4.40.0 (#4324)

Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 docs/source/models/supported_models.rst |   2 +-
 requirements-dev.txt                    |   1 -
 vllm/model_executor/models/__init__.py  |   2 +-
 vllm/model_executor/models/olmo.py      | 305 ++++++++++++------------
 4 files changed, 149 insertions(+), 161 deletions(-)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index f4dd5d52ad873..ceb658bbd5c66 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -101,7 +101,7 @@ Alongside each architecture, we include some popular models that use it.
     -
   * - :code:`OLMoForCausalLM`
     - OLMo
-    - :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc.
+    - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc.
     -
   * - :code:`OPTForCausalLM`
     - OPT, OPT-IML
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 1317e51b2dd11..d9816828d007d 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -26,7 +26,6 @@ requests
 ray
 peft
 awscli
-ai2-olmo # required for OLMo
 
 # Benchmarking
 aiohttp
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index c0aeab5dd3032..6afb2f31c1334 100755
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -42,7 +42,7 @@
     "MptForCausalLM": ("mpt", "MPTForCausalLM"),
     "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
     "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
-    "OLMoForCausalLM": ("olmo", "OLMoForCausalLM"),
+    "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
     "OPTForCausalLM": ("opt", "OPTForCausalLM"),
     "OrionForCausalLM": ("orion", "OrionForCausalLM"),
     "PhiForCausalLM": ("phi", "PhiForCausalLM"),
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index b92003bc0e067..15527569b9e20 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -1,53 +1,36 @@
 # coding=utf-8
 # Adapted from
-# https://github.com/allenai/OLMo/blob/v0.2.4/olmo/model.py and
-# https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/modeling_olmo.py
-# Copyright 2023 The vLLM team.
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
+# https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py
+# Copyright 2024 The vLLM team.
+# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
-# BSD 3-Clause License
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
-# Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu.
-# All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of the copyright holder nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """Inference-only OLMo model compatible with HuggingFace weights."""
 from typing import Iterable, List, Optional, Tuple
 
 import torch
-# this model must need this dependency
-from hf_olmo import OLMoConfig
 from torch import nn
+from transformers import OlmoConfig
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               LinearMethodBase,
+from vllm.model_executor.layers.linear import (LinearMethodBase,
                                                MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
@@ -55,7 +38,7 @@
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplerOutput
@@ -70,55 +53,52 @@ class OlmoAttention(nn.Module):
 
     def __init__(
         self,
-        config: OLMoConfig,
+        config: OlmoConfig,
         linear_method: Optional[LinearMethodBase] = None,
     ):
         super().__init__()
         self.config = config
-        self.hidden_size = config.d_model
-        assert config.d_model % config.n_heads == 0
+        self.hidden_size = config.hidden_size
         tensor_model_parallel_world_size = (
             get_tensor_model_parallel_world_size())
-        self.total_num_heads = self.config.n_heads
+        self.total_num_heads = config.num_attention_heads
+
+        assert self.hidden_size % self.total_num_heads == 0
         assert self.total_num_heads % tensor_model_parallel_world_size == 0
+
         self.num_heads = (self.total_num_heads //
                           tensor_model_parallel_world_size)
         self.head_dim = self.hidden_size // self.total_num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.clip_qkv = config.clip_qkv
 
-        # Layer norms.
-        self.attn_norm = nn.LayerNorm(config.d_model,
-                                      elementwise_affine=False,
-                                      bias=False)
         # Attention input projection. Projects x -> (q, k, v)
-        self.att_proj = QKVParallelLinear(
-            config.d_model,
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
             self.head_dim,
             self.total_num_heads,
-            bias=config.include_bias,
+            bias=config.attention_bias,
             linear_method=linear_method,
         )
 
         # Rotary embeddings.
-        if self.config.rope:
-            rope_theta = getattr(config, "rope_theta", 10000)
-            max_position_embeddings = getattr(config,
-                                              "max_position_embeddings", 8192)
-            self.rotary_emb = get_rope(
-                self.head_dim,
-                rotary_dim=self.head_dim,
-                max_position=max_position_embeddings,
-                base=rope_theta,
-            )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
         self.scaling = self.head_dim**-0.5
         self.attn = Attention(self.num_heads,
                               self.head_dim,
                               scale=self.scaling)
 
         # Attention output projection.
-        self.attn_out = RowParallelLinear(
-            config.d_model,
-            config.d_model,
-            bias=config.include_bias,
+        self.o_proj = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=config.attention_bias,
             linear_method=linear_method,
         )
 
@@ -129,13 +109,13 @@ def forward(
         kv_cache: torch.Tensor,
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
-        hidden_states = self.attn_norm(hidden_states)
-        qkv, _ = self.att_proj(hidden_states)
+        qkv, _ = self.qkv_proj(hidden_states)
+        if self.clip_qkv is not None:
+            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
         q, k, v = qkv.chunk(chunks=3, dim=-1)
-        if self.config.rope:
-            q, k = self.rotary_emb(positions, q, k)
+        q, k = self.rotary_emb(positions, q, k)
         attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
-        output, _ = self.attn_out(attn_output)
+        output, _ = self.o_proj(attn_output)
         return output
 
 
@@ -148,37 +128,30 @@ class OlmoMLP(nn.Module):
 
     def __init__(
         self,
-        config: OLMoConfig,
+        config: OlmoConfig,
         linear_method: Optional[LinearMethodBase] = None,
     ):
         super().__init__()
         self.config = config
-        self.hidden_size = (config.mlp_hidden_size if config.mlp_hidden_size
-                            is not None else config.mlp_ratio * config.d_model)
-
-        # Layer norms.
-        self.ff_norm = nn.LayerNorm(config.d_model,
-                                    elementwise_affine=False,
-                                    bias=False)
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
 
         # Feed-forward input projection.
-        self.ff_proj = MergedColumnParallelLinear(
-            config.d_model,
-            [self.hidden_size // 2] * 2,
-            bias=config.include_bias,
+        self.gate_up_proj = MergedColumnParallelLinear(
+            self.hidden_size,
+            [self.intermediate_size] * 2,
+            bias=False,
             linear_method=linear_method,
         )
 
         # Activation function.
-        self.act = SiluAndMul()
-        self.act.output_multiplier = 0.5
-        assert (self.act.output_multiplier * self.hidden_size) % 1 == 0
+        self.act_fn = SiluAndMul()
 
         # Feed-forward output projection.
-        self.ff_out = RowParallelLinear(
-            int(self.act.output_multiplier * self.hidden_size),
-            config.d_model,
-            bias=config.include_bias,
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=False,
             linear_method=linear_method,
         )
 
@@ -186,19 +159,13 @@ def forward(
         self,
         x: torch.Tensor,
     ) -> torch.Tensor:
-        # Add feed-forward projection.
-        # shape: (batch_size, seq_len, d_model)
-        og_x = x
-        x = self.ff_norm(x)
-        x, _ = self.ff_proj(x)
-        x = self.act(x)
-        x, _ = self.ff_out(x)
-        x = og_x + x
-
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
         return x
 
 
-class OlmoBlock(nn.Module):
+class OlmoDecoderLayer(nn.Module):
     """
     This is a typical transformer block where the output is
     computed as ``MLP(LN(x + Attention(LN(x))))``
@@ -206,15 +173,23 @@ class OlmoBlock(nn.Module):
     """
 
     def __init__(self,
-                 config: OLMoConfig,
+                 config: OlmoConfig,
                  linear_method: Optional[LinearMethodBase] = None):
         super().__init__()
         # Attention block.
-        self.attn = OlmoAttention(config, linear_method)
+        self.self_attn = OlmoAttention(config, linear_method)
 
         # MLP block.
         self.mlp = OlmoMLP(config, linear_method)
 
+        # LayerNorm
+        self.input_layernorm = nn.LayerNorm(config.hidden_size,
+                                            elementwise_affine=False,
+                                            bias=False)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
+                                                     elementwise_affine=False,
+                                                     bias=False)
+
     def forward(
         self,
         positions: torch.Tensor,
@@ -223,52 +198,37 @@ def forward(
         attn_metadata: AttentionMetadata,
     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
         # Attention block.
-        og_x = hidden_states
-        x = self.attn(positions, hidden_states, kv_cache, attn_metadata)
-        x = x + og_x
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(positions, hidden_states, kv_cache,
+                                       attn_metadata)
+        hidden_states = hidden_states + residual
 
         # MLP block.
-        hidden_states = self.mlp(x)
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
         return hidden_states
 
 
 class OlmoModel(nn.Module):
 
     def __init__(self,
-                 config: OLMoConfig,
+                 config: OlmoConfig,
                  linear_method: Optional[LinearMethodBase] = None):
         super().__init__()
         self.config = config
 
-        self.transformer = nn.ModuleDict(
-            dict(
-                wte=VocabParallelEmbedding(
-                    config.embedding_size or config.vocab_size,
-                    config.d_model,
-                ),
-                ln_f=nn.LayerNorm(config.d_model,
-                                  elementwise_affine=False,
-                                  bias=False),
-            ))
-
-        blocks = [
-            OlmoBlock(config, linear_method) for i in range(config.n_layers)
-        ]
-        if self.config.block_group_size > 1:
-            raise NotImplementedError("Block group size > 1 not supported yet")
-        else:
-            self.transformer.update({"blocks": nn.ModuleList(blocks)})
-
-        if not config.weight_tying:
-            self.transformer.update({
-                "ff_out":
-                ColumnParallelLinear(
-                    config.d_model,
-                    config.embedding_size or config.vocab_size,
-                    bias=config.include_bias,
-                    linear_method=linear_method,
-                )
-            })
+        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
+                                                   config.hidden_size)
+        self.layers = nn.ModuleList([
+            OlmoDecoderLayer(config, linear_method)
+            for layer_idx in range(config.num_hidden_layers)
+        ])
+        self.norm = nn.LayerNorm(config.hidden_size,
+                                 elementwise_affine=False,
+                                 bias=False)
 
     def forward(
         self,
@@ -282,39 +242,49 @@ def forward(
         """
         # Get embeddings of input.
         # shape: (batch_size, seq_len, d_model)
-        x = self.transformer.wte(input_ids)  # type: ignore
+        inputs_embeds = self.embed_tokens(input_ids)
+
+        # embed positions
+        hidden_states = inputs_embeds
 
         # Apply blocks one-by-one.
-        for block_idx, block in enumerate(self.transformer.blocks):
+        for layer_idx, decoder_layer in enumerate(self.layers):
             # shape: (batch_size, seq_len, d_model)
-            x = block(
+            hidden_states = decoder_layer(
                 positions,
-                x,
-                kv_caches[block_idx],
+                hidden_states,
+                kv_caches[layer_idx],
                 attn_metadata,
             )
 
         # Apply final layer norm.
         # shape: (batch_size, seq_len or 1, d_model)
-        x = self.transformer.ln_f(x)  # type: ignore
-        return x
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
 
 
-class OLMoForCausalLM(nn.Module):
+class OlmoForCausalLM(nn.Module):
     """
     Extremely barebones HF model wrapper.
     """
 
     def __init__(self,
-                 config: OLMoConfig,
+                 config: OlmoConfig,
                  linear_method: Optional[LinearMethodBase] = None):
         super().__init__()
         self.config = config
         self.linear_method = linear_method
         self.model = OlmoModel(config, linear_method)
-        self.lm_head_weight = (self.model.transformer.wte.weight
-                               if config.weight_tying else
-                               self.model.transformer.ff_out.weight)
+        if config.tie_word_embeddings:
+            self.lm_head_weight = self.model.embed_tokens.weight
+        else:
+            self.unpadded_vocab_size = config.vocab_size
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+            )
+            self.lm_head_weight = self.lm_head.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
@@ -348,20 +318,39 @@ def sample(
         return next_tokens
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         for name, loaded_weight in weights:
-            # attention
-            if ".att" in name:
-                name = name.replace(".att", ".attn.att")
-            # mlp
-            if ".ff_proj" in name:
-                name = name.replace(".ff_proj", ".mlp.ff_proj")
-                # Reverse the weight for the MergeColumnParallelLinear
-                loaded_weight = torch.concat(loaded_weight.chunk(2)[::-1])
-            if ".ff_out" in name and "transformer.ff_out" not in name:
-                name = name.replace(".ff_out", ".mlp.ff_out")
-            # there is no bias in olmo
-            param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
-            weight_loader(param, loaded_weight)
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if ("rotary_emb.cos_cached" in name
+                    or "rotary_emb.sin_cached" in name):
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)

From 74e20c2f2f335b66d7396b9b300b53d61b1fff6f Mon Sep 17 00:00:00 2001
From: Alexei-V-Ivanov-AMD
 <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Date: Thu, 25 Apr 2024 11:37:20 -0500
Subject: [PATCH 094/106] [CI/Build] Adding functionality to reset the node's
 GPUs before processing. (#4213)

---
 .buildkite/run-amd-test.sh | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index 83a56e25aca73..38aff57a410dc 100644
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -5,6 +5,19 @@ set -ex
 # Print ROCm version
 rocminfo
 
+
+echo "reset" > /opt/amdgpu/etc/gpu_state
+
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
+
+
 # Try building the docker image
 docker build -t rocm -f Dockerfile.rocm .
 
@@ -14,7 +27,8 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image
-docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &
+export HIP_VISIBLE_DEVICES=1
+docker run --device /dev/kfd --device /dev/dri --network host -e HIP_VISIBLE_DEVICES --name rocm rocm python3 -m vllm.entrypoints.api_server &
 
 # Wait for the server to start
 wait_for_server_to_start() {

From 0f38d718462e4e7d99f1836daf5f9bf6cc9b10ea Mon Sep 17 00:00:00 2001
From: Caio Mendes <caioctmendes@gmail.com>
Date: Thu, 25 Apr 2024 14:32:00 -0300
Subject: [PATCH 095/106] [Doc] README Phi-3 name fix. (#4372)

Co-authored-by: Caio Mendes <caiocesart@microsoft.com>

From fff6cd29628a069b44430e2ccdaa0a4268ec8483 Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Thu, 25 Apr 2024 19:03:56 +0000
Subject: [PATCH 096/106] [Core]refactor aqlm quant ops  (#4351)

---
 benchmarks/kernels/benchmark_aqlm.py            |  2 +-
 vllm/_custom_ops.py                             | 14 ++++++++++++++
 vllm/model_executor/layers/quantization/aqlm.py |  2 +-
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py
index 9602d20bcbc74..59392947b15c8 100644
--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
@@ -6,7 +6,7 @@
 import torch
 import torch.nn.functional as F
 
-from vllm._C import ops
+from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.aqlm import (
     dequantize_weight, generic_dequantize_gemm, get_int_dtype,
     optimized_dequantize_gemm)
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index e4b16ed918d1a..508d35656eb00 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -153,6 +153,20 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
                                 size_n, size_k)
 
 
+# aqlm
+def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor,
+              codebooks: torch.Tensor, scales: torch.Tensor,
+              codebook_partition_sizes: torch.Tensor,
+              bias: Optional[torch.Tensor]) -> torch.Tensor:
+    return vllm_ops.aqlm_gemm(input, codes, codebooks, scales,
+                              codebook_partition_sizes, bias)
+
+
+def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor,
+                 codebook_partition_sizes: torch.Tensor) -> torch.Tensor:
+    return vllm_ops.aqlm_dequant(codes, codebooks, codebook_partition_sizes)
+
+
 # fp8
 def scaled_fp8_quant(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
     scale = torch.zeros(1, device=input.device, dtype=torch.float32)
diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py
index 6115b1de679ad..b48c6e1702be4 100644
--- a/vllm/model_executor/layers/quantization/aqlm.py
+++ b/vllm/model_executor/layers/quantization/aqlm.py
@@ -8,7 +8,7 @@
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter
 
-from vllm._C import ops
+from vllm import _custom_ops as ops
 from vllm.model_executor.layers.linear import (LinearMethodBase,
                                                set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (

From 9bb7eff780d7bc0d9ad5b563859435cb00f76d39 Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Fri, 26 Apr 2024 04:13:50 +0900
Subject: [PATCH 097/106] [Mypy] Typing lora folder (#4337)

---
 .github/workflows/mypy.yaml |  7 ++--
 format.sh                   |  2 +-
 vllm/lora/layers.py         | 35 ++++++++++++--------
 vllm/lora/lora.py           | 28 +++++++++-------
 vllm/lora/models.py         | 64 ++++++++++++++++++++-----------------
 vllm/lora/worker_manager.py | 21 ++++++------
 vllm/worker/model_runner.py |  4 +--
 7 files changed, 91 insertions(+), 70 deletions(-)

diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index 9f1855696e20a..089c7d18ad6f2 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -33,8 +33,6 @@ jobs:
     - name: Mypy
       run: |
         mypy vllm/attention --config-file pyproject.toml
-        # TODO(sang): Fix nested dir
-        mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
         mypy vllm/distributed --config-file pyproject.toml
         mypy vllm/entrypoints --config-file pyproject.toml
         mypy vllm/executor --config-file pyproject.toml
@@ -44,8 +42,9 @@ jobs:
         mypy vllm/engine  --config-file pyproject.toml
         mypy vllm/worker --config-file pyproject.toml
         mypy vllm/spec_decode --config-file pyproject.toml
+        mypy vllm/lora --config-file pyproject.toml
+
         # TODO(sang): Fix nested dir
         mypy vllm/model_executor/*.py  --config-file pyproject.toml
-        # TODO(sang): Fix nested dir
-        # mypy vllm/lora/*.py --config-file pyproject.toml
+        mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
 
diff --git a/format.sh b/format.sh
index bd2e9e89e1806..4ac1842daef0a 100755
--- a/format.sh
+++ b/format.sh
@@ -106,7 +106,7 @@ mypy vllm/engine  --config-file pyproject.toml
 mypy vllm/worker --config-file pyproject.toml
 mypy vllm/spec_decode --config-file pyproject.toml
 mypy vllm/model_executor/*.py  --config-file pyproject.toml
-# mypy vllm/lora/*.py --config-file pyproject.toml
+mypy vllm/lora --config-file pyproject.toml
 
 
 CODESPELL_EXCLUDES=(
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index aac86351b15e1..98e74168002c4 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -176,6 +176,8 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
     def __init__(self, base_layer: VocabParallelEmbedding) -> None:
         super().__init__()
         self.base_layer = base_layer
+        self.embeddings_slice: Optional[Tuple[int, int]]
+        self.embeddings_weights: Optional[torch.Tensor]
 
     def create_lora_weights(
             self,
@@ -233,9 +235,10 @@ def create_lora_weights(
             self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1],
             self.lora_a_stacked.shape[2],
         )
-        self.indices: Optional[torch.Tensor] = None
-        self.indices_len: Optional[List[int]] = None
-        self.embeddings_indices = None
+        # Lazily initialized.
+        self.indices: torch.Tensor
+        self.indices_len: List[int]
+        self.embeddings_indices: torch.Tensor
 
     def reset_lora(self, index: int):
         self.lora_a_stacked[index] = 0
@@ -267,6 +270,7 @@ def set_lora(
                     self.embeddings_tensors.shape[1],
                     self.embeddings_tensors.shape[2]
                 )[self.embeddings_slice[0]:self.embeddings_slice[1]]
+                assert self.embeddings_weights is not None
                 self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings)
 
     def set_mapping(
@@ -343,11 +347,12 @@ def create_lora_weights(
             dtype=lora_config.lora_dtype,
             device=self.device,
         )
-
-        self.indices: Optional[torch.Tensor] = None
-        self.indices_len: Optional[List[int]] = None
         self.output_dim = self.lora_b_stacked.shape[2]
 
+        # lazily initialized.
+        self.indices: torch.Tensor
+        self.indices_len: List[int]
+
     def reset_lora(self, index: int):
         self.lora_a_stacked[index] = 0
         self.lora_b_stacked[index] = 0
@@ -475,8 +480,9 @@ def create_lora_weights(
                 device=self.device,
             ) for _ in range(n_slices))
 
-        self.indices: Optional[torch.Tensor] = None
         self.output_dim = self.lora_b_stacked[0].shape[2]
+        # Lazily initialized.
+        self.indices: torch.Tensor
 
     def reset_lora(self, index: int):
         self.lora_a_stacked[0][index] = 0
@@ -690,7 +696,8 @@ def create_lora_weights(
                               self.kv_proj_shard_size)
         self.packed_indices: Optional[torch.Tensor] = None
         self.standard_indices: Optional[torch.Tensor] = None
-        self.indices_len: Optional[List[int]] = None
+        # lazily initialized.
+        self.indices_len: List[int]
 
     def reset_lora(self, index: int):
         self.lora_a_stacked[0][index] = 0
@@ -814,8 +821,9 @@ def create_lora_weights(
             dtype=lora_config.lora_dtype,
             device=self.device,
         )
-        self.indices: Optional[torch.Tensor] = None
-        self.indices_len: Optional[List[int]] = None
+        # Lazily initialized
+        self.indices: torch.Tensor
+        self.indices_len: List[int]
 
     def reset_lora(self, index: int):
         self.lora_a_stacked[index] = 0
@@ -991,9 +999,10 @@ def create_lora_weights(
             dtype=self.dtype,
             device=self.device,
         )
-        self.indices = None
-        self.indices_padded = None
-        self.indices_len = None
+        # Lazily initialized.
+        self.indices: torch.Tensor
+        self.indices_len: List[int]
+        self.indices_padded: torch.Tensor
 
     def reset_lora(self, index: int):
         self.lora_a_stacked[index] = 0
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py
index fefad16700fe3..d7794aa7cd35c 100644
--- a/vllm/lora/lora.py
+++ b/vllm/lora/lora.py
@@ -97,9 +97,9 @@ def __init__(
         self,
         module_name: str,
         rank: int,
-        lora_alphas: List[int],
-        lora_a: List[torch.Tensor],
-        lora_b: List[torch.Tensor],
+        lora_alphas: List[Optional[int]],
+        lora_a: List[Optional[torch.Tensor]],
+        lora_b: List[Optional[torch.Tensor]],
         scaling: Optional[List[float]] = None,
     ) -> None:
         super().__init__(
@@ -108,17 +108,20 @@ def __init__(
             lora_alpha=0,
             lora_a=lora_a,
             lora_b=lora_b,
-            scaling=scaling,
+            scaling=scaling,  # type: ignore
             embeddings_tensor=None,
         )
         self.lora_alphas = lora_alphas
         if scaling is None:
-            self.scaling = [
-                lora_alpha / self.rank for lora_alpha in self.lora_alphas
+            self.scaling = [  # type: ignore
+                lora_alpha / self.rank  # type: ignore # noqa
+                for lora_alpha in self.lora_alphas
             ]
 
     @classmethod
-    def pack(cls, loras: List["LoRALayerWeights"]) -> "PackedLoRALayerWeights":
+    def pack(
+            cls, loras: List[Optional["LoRALayerWeights"]]
+    ) -> "PackedLoRALayerWeights":
         """Pack a list of LoRAs into a single LoRA.
 
         If LoRA is None, it signifies that the submodule does not have a LoRA.
@@ -136,16 +139,19 @@ def pack(cls, loras: List["LoRALayerWeights"]) -> "PackedLoRALayerWeights":
             [lora.lora_alpha if lora is not None else None for lora in loras],
             [lora.lora_a if lora is not None else None for lora in loras],
             [lora.lora_b if lora is not None else None for lora in loras],
-            scaling=[1 if lora is not None else None for lora in loras])
+            scaling=[
+                1 if lora is not None else None  # type: ignore
+                for lora in loras
+            ])
         return obj
 
     def optimize(self) -> "PackedLoRALayerWeights":
         """Optimize the LoRA by merging the scaling into lora_b."""
         for i in range(len(self.lora_b)):
-            if self.scaling[i] == 1 or self.lora_b[i] is None:
+            if self.scaling[i] == 1 or self.lora_b[i] is None:  # type: ignore
                 continue
-            self.lora_b[i] *= self.scaling[i]
-            self.scaling[i] = 1
+            self.lora_b[i] *= self.scaling[i]  # type: ignore
+            self.scaling[i] = 1  # type: ignore
         return self
 
     @property
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 6bb9fee27d535..c249497a4d893 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -3,7 +3,7 @@
 import math
 import os
 import re
-from typing import Callable, Dict, Hashable, List, Optional, Tuple, Type
+from typing import Callable, Dict, List, Optional, Tuple, Type
 
 import safetensors.torch
 import torch
@@ -53,44 +53,46 @@ def convert_mapping(
                 embeddings.
             indices_len: List of lengths of the above tensors.
     """
-    indices = list(mapping.index_mapping).copy()
-    embedding_indices = indices.copy()
-    lora_indices = indices.copy()
-    prompt_mapping = [
+    index_mapping_indices: List[int] = list(mapping.index_mapping).copy()
+    embedding_indices = index_mapping_indices.copy()
+    lora_indices = index_mapping_indices.copy()
+    prompt_mapping: List[int] = [
         lora_index_to_id.index(x) if x > 0 else -1
         for x in mapping.prompt_mapping
     ]
     lora_idx = None
-    for i in range(len(indices)):
+    for i in range(len(index_mapping_indices)):
         # TODO index can be slow. optimize
-        lora_idx = (lora_index_to_id.index(indices[i])
-                    if indices[i] > 0 else -1)
-        embedding_indices[i] = lora_idx if indices[i] > 0 else 0
-        indices[i] = i
+        lora_idx = (lora_index_to_id.index(index_mapping_indices[i])
+                    if index_mapping_indices[i] > 0 else -1)
+        embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0
+        index_mapping_indices[i] = i
         lora_indices[i] = lora_idx
 
-    indices = torch.tensor([indices, lora_indices, embedding_indices],
-                           dtype=torch.long,
-                           device="cuda")
-    prompt_mapping = torch.tensor(prompt_mapping,
-                                  device="cuda",
-                                  dtype=torch.long)
+    indices = torch.tensor(
+        [index_mapping_indices, lora_indices, embedding_indices],
+        dtype=torch.long,
+        device="cuda")
+    prompt_mapping_tensor = torch.tensor(prompt_mapping,
+                                         device="cuda",
+                                         dtype=torch.long)
     embeddings_indices = torch.stack([
         indices[2] * extra_vocab_size,
         indices[2] * (vocab_size + extra_vocab_size)
     ])
     embeddings_indices[embeddings_indices == -1] = max_loras - 1
     base_indices = indices[1]
-    sampler_indices = prompt_mapping
+    sampler_indices = prompt_mapping_tensor
     sampler_indices_padded = sampler_indices.clone()
     sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1
     sampler_indices_padded = (
         torch.arange(
             0, len(sampler_indices_padded), device="cuda", dtype=torch.long) +
         (sampler_indices_padded * len(sampler_indices_padded)))
-    indices_len = (base_indices.shape[-1], sampler_indices.shape[-1],
-                   sampler_indices_padded.shape[-1],
-                   embeddings_indices.shape[-1])
+    indices_len = [
+        base_indices.shape[-1], sampler_indices.shape[-1],
+        sampler_indices_padded.shape[-1], embeddings_indices.shape[-1]
+    ]
 
     return (base_indices, sampler_indices, sampler_indices_padded,
             embeddings_indices, indices_len)
@@ -149,6 +151,7 @@ def from_lora_tensors(
             if module_name not in loras:
                 lora_embeddings_tensor = None
                 if embeddings:
+                    assert embedding_modules is not None
                     embeddings_module = next(
                         (k for k in embedding_modules if k in module_name),
                         None)
@@ -171,6 +174,7 @@ def from_lora_tensors(
             else:
                 loras[module_name].lora_b = tensor.to(device=device,
                                                       dtype=dtype).t()
+                assert embedding_padding_modules is not None
                 if any(name in module_name
                        for name in embedding_padding_modules
                        ) and target_embedding_padding is not None:
@@ -295,11 +299,10 @@ def __init__(
                                               self.max_num_batched_tokens,
                                               dtype=torch.long,
                                               device="cuda")
-        self.offsets = []
         # 4 is the number of indicies tensors defined above
         # base_indices, sampler_indices, sampler_indices_padded,
         # embeddings_indices
-        self.indices_len = [None] * 4
+        self.indices_len: List[Optional[int]] = [None] * 4
 
         self.model: nn.Module = model
         if hasattr(self.model, "supported_lora_modules"):
@@ -312,7 +315,7 @@ def __init__(
         self._registered_loras: Dict[int, LoRAModel] = {}
         # Dict instead of a Set for compatibility with LRUCache.
         self._active_loras: Dict[int, None] = {}
-        self._last_mapping = None
+        self._last_mapping: Optional[LoRAMapping] = None
         self._create_lora_modules()
         self.model.lora_manager = self
 
@@ -370,7 +373,7 @@ def deactivate_lora(self, lora_id: int) -> bool:
             return True
         return False
 
-    def _add_lora(self, lora: LoRAModel) -> bool:
+    def _add_lora(self, lora: LoRAModel):
         self._create_merged_loras_inplace(lora)
         self._registered_loras[lora.id] = lora
 
@@ -418,7 +421,7 @@ def list_loras(self) -> Dict[int, LoRAModel]:
     def get_lora(self, lora_id: int) -> Optional[LoRAModel]:
         return self._registered_loras.get(lora_id, None)
 
-    def remove_all_loras(self) -> bool:
+    def remove_all_loras(self):
         """Remove all LoRAModels from the manager."""
         self._registered_loras.clear()
         self.lora_index_to_id = [None] * self.lora_slots
@@ -467,6 +470,7 @@ def create_dummy_lora(
                 continue
             parts = module_name.split(".")
             if module_name not in self.packed_modules:
+                assert embedding_modules is not None
                 if parts[-1] in embedding_modules:
                     input_dim = (module.base_layer.org_vocab_size +
                                  self.lora_config.lora_extra_vocab_size if
@@ -500,7 +504,7 @@ def create_dummy_lora(
             else:
                 parts = module_name.split(".")
                 replacements = self.packed_modules_mapping[parts[-1]]
-                subloras = []
+                subloras: List[Optional["LoRALayerWeights"]] = []
                 for i, r in enumerate(replacements):
                     lora = LoRALayerWeights.create_dummy_lora_weights(
                         module_name + "." + r,
@@ -538,7 +542,7 @@ def _register_packed_modules(self, module_full_name: str) -> None:
 
     def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None:
         for module_name, new_module_names in self.packed_modules.items():
-            replacement_loras = []
+            replacement_loras: List[Optional[LoRALayerWeights]] = []
             has_replacement = False
             for r in new_module_names:
                 lora = lora_model.get_lora(r)
@@ -557,12 +561,12 @@ def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None:
 
 class LoRALRUCache(LRUCache[LoRAModel]):
 
-    def __init__(self, capacity: int, deactivate_lora_fn: Callable[[Hashable],
-                                                                   None]):
+    def __init__(self, capacity: int, deactivate_lora_fn: Callable[[int],
+                                                                   bool]):
         super().__init__(capacity)
         self.deactivate_lora_fn = deactivate_lora_fn
 
-    def _on_remove(self, key: Hashable, value: LoRAModel):
+    def _on_remove(self, key: int, value: LoRAModel):
         logger.debug(f"Removing LoRA. int id: {key}")
         self.deactivate_lora_fn(key)
         return super()._on_remove(key, value)
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 5356b79537b05..ec3c10c591a18 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod, abstractproperty
-from typing import Any, Dict, List, Optional, Set, Type
+from typing import Any, Dict, List, Set, Type
 
 import torch
 
@@ -37,7 +37,7 @@ def create_lora_manager(
         ...
 
     @abstractmethod
-    def set_active_loras(self, lora_requests: List[LoRARequest],
+    def set_active_loras(self, lora_requests: Set[LoRARequest],
                          lora_mapping: LoRAMapping) -> None:
         ...
 
@@ -54,7 +54,7 @@ def remove_lora(self, lora_id: int) -> bool:
         ...
 
     @abstractmethod
-    def remove_all_loras(self) -> bool:
+    def remove_all_loras(self):
         ...
 
     @abstractmethod
@@ -81,10 +81,11 @@ def __init__(
         embedding_padding_modules: List[str],
         lora_model_cls: Type[LoRAModel] = LoRAModel,
     ):
-        self._lora_manager: Optional[LoRAModelManager] = None
         self._lora_model_cls = lora_model_cls
         self.embedding_modules = embedding_modules
         self.embedding_padding_modules = embedding_padding_modules
+        # Lazily initialized by create_lora_manager.
+        self._lora_manager: LoRAModelManager
         super().__init__(max_num_seqs, max_num_batched_tokens, vocab_size,
                          lora_config, device)
 
@@ -104,7 +105,7 @@ def create_lora_manager(
             lora_config=self.lora_config,
             lora_manager_cls=self._lora_manager_cls,
         )
-        self._lora_manager: LoRAModelManager = lora_manager
+        self._lora_manager = lora_manager
         return lora_manager.model
 
     def set_active_loras(self, lora_requests: Set[LoRARequest],
@@ -188,7 +189,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool:
     def remove_lora(self, lora_id: int) -> bool:
         return self._lora_manager.remove_lora(lora_id)
 
-    def remove_all_loras(self) -> bool:
+    def remove_all_loras(self):
         self._lora_manager.remove_all_loras()
 
     def list_loras(self) -> Set[int]:
@@ -217,10 +218,10 @@ def create_lora_manager(
             lora_config=self.lora_config,
             max_num_batched_tokens=self.max_num_batched_tokens,
         )
-        self._lora_manager: LRUCacheLoRAModelManager = lora_manager
+        self._lora_manager = lora_manager
         return lora_manager.model
 
-    def _apply_loras(self, lora_requests: List[LoRARequest]) -> None:
+    def _apply_loras(self, lora_requests: Set[LoRARequest]) -> None:
         loras_map = {
             lora_request.lora_int_id: lora_request
             for lora_request in lora_requests if lora_request
@@ -237,12 +238,14 @@ def add_lora(self, lora_request: LoRARequest) -> bool:
         if lora_request.lora_int_id not in self.list_loras():
             # Remove before we load the new lora to save memory
             if len(self._lora_manager) + 1 > self._lora_manager.capacity:
+                assert isinstance(self._lora_manager, LRUCacheLoRAModelManager)
                 self._lora_manager.remove_oldest_lora()
             lora = self._load_lora(lora_request)
             loaded = self._lora_manager.add_lora(lora)
         else:
             # If the lora is already loaded, just touch it to
             # update its position in the caches
-            loaded = self._lora_manager.get_lora(lora_request.lora_int_id)
+            loaded = self._lora_manager.get_lora(
+                lora_request.lora_int_id) is not None
         self._lora_manager.activate_lora(lora_request.lora_int_id)
         return loaded
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 13e20d8524a1b..65996f1710a8a 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -930,10 +930,10 @@ def profile_run(self) -> None:
         torch.cuda.synchronize()
         return
 
-    def remove_all_loras(self) -> bool:
+    def remove_all_loras(self):
         if not self.lora_manager:
             raise RuntimeError("LoRA is not enabled.")
-        return self.lora_manager.remove_all_loras()
+        self.lora_manager.remove_all_loras()
 
     def set_active_loras(self, lora_requests: Set[LoRARequest],
                          lora_mapping: LoRAMapping) -> None:

From 1917d8658f60b7691c6b0b9af27b48c97540d2f3 Mon Sep 17 00:00:00 2001
From: Roy <jasonailu87@gmail.com>
Date: Fri, 26 Apr 2024 03:43:32 +0800
Subject: [PATCH 098/106] [Misc] Fix flash attention backend log  (#4368)

---
 vllm/attention/selector.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 554e802cd5513..7cc17f21dcd0e 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -25,7 +25,7 @@ class _Backend(enum.Enum):
 def get_attn_backend(dtype: torch.dtype) -> Type[AttentionBackend]:
     backend = _which_attn_to_use(dtype)
     if backend == _Backend.FLASH_ATTN:
-        logger.info("Using FlashAttention backend.")
+        logger.info("Using FlashAttention-2 backend.")
         from vllm.attention.backends.flash_attn import (  # noqa: F401
             FlashAttentionBackend)
         return FlashAttentionBackend
@@ -62,12 +62,12 @@ def _which_attn_to_use(dtype: torch.dtype) -> _Backend:
     # NVIDIA GPUs.
     if torch.cuda.get_device_capability()[0] < 8:
         # Volta and Turing NVIDIA GPUs.
-        logger.info("Cannot use FlashAttention backend for Volta and Turing "
+        logger.info("Cannot use FlashAttention-2 backend for Volta and Turing "
                     "GPUs.")
         return _Backend.XFORMERS
 
     if dtype not in (torch.float16, torch.bfloat16):
-        logger.info("Cannot use FlashAttention backend for dtype other than "
+        logger.info("Cannot use FlashAttention-2 backend for dtype other than "
                     "torch.float16 or torch.bfloat16.")
         return _Backend.XFORMERS
 
@@ -75,8 +75,8 @@ def _which_attn_to_use(dtype: torch.dtype) -> _Backend:
         import flash_attn  # noqa: F401
     except ImportError:
         logger.info(
-            "Cannot use FlashAttention backend because the flash_attn package "
-            "is not found. Please install it for better performance.")
+            "Cannot use FlashAttention-2 backend because the flash_attn "
+            "package is not found. Please install it for better performance.")
         return _Backend.XFORMERS
 
     backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)

From b6d61b2f44102528e98841d214d80e664a907cb8 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Fri, 26 Apr 2024 22:26:12 +0000
Subject: [PATCH 099/106] ./format, fixed tests failing in automation due to
 ray.init()

---
 .github/scripts/run-tests                     |  6 ++++-
 setup.py                                      | 12 +++-------
 .../test_basic_distributed_correctness.py     |  8 ++++---
 .../test_chunked_prefill_distributed.py       |  7 +++---
 vllm/core/scheduler.py                        | 21 -----------------
 vllm/engine/arg_utils.py                      | 23 ++++++++++++++-----
 6 files changed, 34 insertions(+), 43 deletions(-)

diff --git a/.github/scripts/run-tests b/.github/scripts/run-tests
index d0d28cdc9a0aa..82c08c3f16b04 100755
--- a/.github/scripts/run-tests
+++ b/.github/scripts/run-tests
@@ -31,7 +31,7 @@ while getopts "hs:t:r:f:" OPT; do
         ;;
 	t)
 	    TEST_DIR="${OPTARG}"
-	    ;;
+	    ;;x
     r)
         RESULTS_DIR="${OPTARG}"
         ;;
@@ -115,6 +115,10 @@ do
         CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
     elif [[ "${TEST}" == *"test_basic_distributed_correctness"* ]]; then
         CUDA_VISIBLE_DEVICES=0,1 TEST_DIST_MODEL=facebook/opt-125m pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
+        CUDA_VISIBLE_DEVICES=0,1 TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
+    elif [[ "${TEST}" == *"test_chunked_prefill_correctness"* ]]; then
+        CUDA_VISIBLE_DEVICES=0,1 TEST_DIST_MODEL=facebook/opt-125m pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
+        CUDA_VISIBLE_DEVICES=0,1 TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
     elif [[ "${TEST}" == *"test_models_logprobs"* ]]; then
         pytest --forked ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
     else
diff --git a/setup.py b/setup.py
index c755bf8f94067..e4c4773e9dca7 100644
--- a/setup.py
+++ b/setup.py
@@ -380,14 +380,6 @@ def _read_requirements(filename: str) -> List[str]:
 # UPSTREAM SYNC: needed for sparsity
 _sparsity_deps = ["nm-magic-wand-nightly"]
 
-
-def get_extra_requirements() -> dict:
-    return {
-        "sparse": _sparsity_deps,
-        "sparsity": _sparsity_deps,
-    }
-
-
 package_data = {
     "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
 }
@@ -427,10 +419,12 @@ def get_extra_requirements() -> dict:
                                     "tests")),
     python_requires=">=3.8",
     install_requires=get_requirements(),
-    extras_require=get_extra_requirements(),
     ext_modules=ext_modules,
     extras_require={
         "tensorizer": ["tensorizer==2.9.0a1"],
+        # UPSTREAM SYNC: required for sparsity
+        "sparse": _sparsity_deps,
+        "sparsity": _sparsity_deps,
     },
     cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
     package_data=package_data,
diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py
index 77aa90b12bf8f..4021dba90ee0a 100644
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@@ -10,13 +10,15 @@
     test_basic_distributed_correctness.py
 ```
 """
-import os
-
+# UPSTREAM SYNC: We can only run one model per invocation of the test.
+#   Otherwise, we have duplicate ray.init() calls which fails.
+#   Rather than ruining .github/scripts/run-tests to pass via env
+#   variables, we just run llama which is sufficient for smoke test.
 import pytest
 import torch
 
 MODELS = [
-    os.environ["TEST_DIST_MODEL"],
+    "meta-llama/Llama-2-7b-hf",
 ]
 
 
diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py
index eeac0ae9e899a..209d03084c3e5 100644
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
@@ -11,13 +11,14 @@
     test_chunked_prefill_distributed.py
 ```
 """
-
+# UPSTREAM SYNC: We can only run one model per invocation of the test.
+#   Otherwise, we have duplicate ray.init() calls which fails.
+#   Rather than ruining .github/scripts/run-tests to pass via env
+#   variables, we just run llama which is sufficient for smoke test.
 import pytest
 import torch
 
-# UPSTREAM SYNC: our automation does not call via env variables.
 MODELS = [
-    "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
 ]
 
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index fe6b29d17d753..99f7a34d336a4 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -883,27 +883,6 @@ def _can_swap_in(self, seq_group: SequenceGroup) -> bool:
             num_lookahead_slots=self._get_num_lookahead_slots(is_prefill),
         )
 
-    def _can_append_slots(self, seq_group: SequenceGroup) -> bool:
-        """Determine whether or not we have enough space in the KV cache to
-        continue generation of the sequence group.
-        """
-        # Appending slots only occurs in decoding.
-        is_prefill = False
-
-        return self.block_manager.can_append_slots(
-            seq_group=seq_group,
-            num_lookahead_slots=self._get_num_lookahead_slots(is_prefill),
-        )
-
-    def _can_swap_in(self, seq_group: SequenceGroup) -> bool:
-        # Swapping in is considered decode.
-        is_prefill = False
-
-        return self.block_manager.can_swap_in(
-            seq_group=seq_group,
-            num_lookahead_slots=self._get_num_lookahead_slots(is_prefill),
-        )
-
     def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
         # Schedule sequence groups.
         # This function call changes the internal states of the scheduler
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 330681e863d4e..ae623b6f2600a 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -477,13 +477,24 @@ def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
     def create_engine_config(self, ) -> EngineConfig:
         device_config = DeviceConfig(self.device)
         model_config = ModelConfig(
-            self.model, self.tokenizer, self.tokenizer_mode,
-            self.trust_remote_code, self.dtype, self.seed, self.revision,
-            self.code_revision, self.tokenizer_revision, self.max_model_len,
+            self.model,
+            self.tokenizer,
+            self.tokenizer_mode,
+            self.trust_remote_code,
+            self.dtype,
+            self.seed,
+            self.revision,
+            self.code_revision,
+            self.tokenizer_revision,
+            self.max_model_len,
             # UPSTREAM SYNC: keep sparsity argument
-            self.quantization, self.quantization_param_path, self.sparsity,
-            self.enforce_eager,self.max_context_len_to_capture,
-            self.max_logprobs, self.skip_tokenizer_init)
+            self.quantization,
+            self.quantization_param_path,
+            self.sparsity,
+            self.enforce_eager,
+            self.max_context_len_to_capture,
+            self.max_logprobs,
+            self.skip_tokenizer_init)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space, self.kv_cache_dtype,

From 6dcf1813e3af97682139e874d9da89ba668f9f61 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Sat, 27 Apr 2024 11:16:59 +0000
Subject: [PATCH 100/106] fixed typo in run tests script

---
 .github/scripts/run-tests | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/.github/scripts/run-tests b/.github/scripts/run-tests
index 82c08c3f16b04..046059e4088d3 100755
--- a/.github/scripts/run-tests
+++ b/.github/scripts/run-tests
@@ -31,7 +31,7 @@ while getopts "hs:t:r:f:" OPT; do
         ;;
 	t)
 	    TEST_DIR="${OPTARG}"
-	    ;;x
+	    ;;
     r)
         RESULTS_DIR="${OPTARG}"
         ;;
@@ -113,12 +113,8 @@ do
     # need to be run with specific options
     if [[ "${TEST}" == *"kernels"* || "${TEST}" == *"samplers"* ]]; then
         CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
-    elif [[ "${TEST}" == *"test_basic_distributed_correctness"* ]]; then
-        CUDA_VISIBLE_DEVICES=0,1 TEST_DIST_MODEL=facebook/opt-125m pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
-        CUDA_VISIBLE_DEVICES=0,1 TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
-    elif [[ "${TEST}" == *"test_chunked_prefill_correctness"* ]]; then
-        CUDA_VISIBLE_DEVICES=0,1 TEST_DIST_MODEL=facebook/opt-125m pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
-        CUDA_VISIBLE_DEVICES=0,1 TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
+    elif [[ "${TEST}" == *"distributed"* ]]; then
+        CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
     elif [[ "${TEST}" == *"test_models_logprobs"* ]]; then
         pytest --forked ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
     else

From a8b853a88299ac207b16b5ebc196f7c880b1b1de Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Tue, 30 Apr 2024 00:32:34 +0000
Subject: [PATCH 101/106] fixed sparsity issues with model loader refactor

---
 tests/conftest.py                             |   8 +
 tests/models/test_compressed.py               |  18 +--
 tests/models/test_compressed_memory.py        |  19 +--
 .../sparsity/sparse_w16a16_linear_method.py   |   5 +-
 vllm/model_executor/model_loader.py           | 147 ------------------
 vllm/model_executor/model_loader/loader.py    |  25 ++-
 .../model_loader/weight_utils.py              |   2 +-
 7 files changed, 51 insertions(+), 173 deletions(-)
 delete mode 100644 vllm/model_executor/model_loader.py

diff --git a/tests/conftest.py b/tests/conftest.py
index 98ae0404b4426..43e08f0fc33fa 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -346,6 +346,10 @@ def generate_greedy_logprobs_nm(
         return [(output_ids, output_str, output_logprobs)
                 for output_ids, output_str, output_logprobs in outputs]
 
+    def __del__(self):
+        del self.model
+        cleanup()
+
 
 @pytest.fixture
 def hf_runner_nm():
@@ -533,6 +537,10 @@ def generate_greedy_logprobs(
         return [(output_ids, output_str, output_logprobs)
                 for output_ids, output_str, output_logprobs in outputs]
 
+    def __del__(self):
+        del self.model
+        cleanup()
+
 
 @pytest.fixture
 def vllm_runner_nm():
diff --git a/tests/models/test_compressed.py b/tests/models/test_compressed.py
index a72d01d8956ed..5b4446fcbd9d2 100644
--- a/tests/models/test_compressed.py
+++ b/tests/models/test_compressed.py
@@ -26,7 +26,7 @@
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_models(
-    vllm_runner_nm,
+    vllm_runner,
     example_prompts,
     model_format_pairs,
     dtype: str,
@@ -35,20 +35,20 @@ def test_models(
 ) -> None:
     model_name, sparsity = model_format_pairs
 
-    sparse_model = vllm_runner_nm(model_name=model_name,
-                                  sparsity=sparsity,
-                                  dtype=dtype,
-                                  max_model_len=MAX_MODEL_LEN)
+    sparse_model = vllm_runner(model_name=model_name,
+                               sparsity=sparsity,
+                               dtype=dtype,
+                               max_model_len=MAX_MODEL_LEN)
     sparse_outputs = sparse_model.generate_greedy_logprobs(
         example_prompts, max_tokens, num_logprobs)
 
     del sparse_model
     gc.collect()
 
-    dense_model = vllm_runner_nm(model_name=model_name,
-                                 sparsity=None,
-                                 dtype=dtype,
-                                 max_model_len=MAX_MODEL_LEN)
+    dense_model = vllm_runner(model_name=model_name,
+                              sparsity=None,
+                              dtype=dtype,
+                              max_model_len=MAX_MODEL_LEN)
     dense_outputs = dense_model.generate_greedy_logprobs(
         example_prompts, max_tokens, num_logprobs)
 
diff --git a/tests/models/test_compressed_memory.py b/tests/models/test_compressed_memory.py
index 15e6824b05b3b..ab8a08f7a58dd 100644
--- a/tests/models/test_compressed_memory.py
+++ b/tests/models/test_compressed_memory.py
@@ -22,21 +22,17 @@
 
 @pytest.mark.parametrize("model_format_extrablocks", MODEL_FORMAT_EXTRABLOCKS)
 @pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [3])
 def test_models(
-    vllm_runner_nm,
-    example_prompts,
+    vllm_runner,
     model_format_extrablocks,
     dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
 ) -> None:
     model_name, sparsity, num_extra_blocks = model_format_extrablocks
-    dense_model = vllm_runner_nm(model_name=model_name,
-                                 sparsity=None,
-                                 dtype=dtype,
-                                 max_model_len=1024)
+    dense_model = vllm_runner(model_name=model_name,
+                              enforce_eager=True,
+                              sparsity=None,
+                              dtype=dtype,
+                              max_model_len=1024)
     dense_gpu_alloc = (
         dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator)
     dense_num_kv_blocks = dense_gpu_alloc.num_blocks
@@ -45,8 +41,9 @@ def test_models(
     torch.cuda.empty_cache()
     gc.collect()
 
-    sparse_model = vllm_runner_nm(
+    sparse_model = vllm_runner(
         model_name=model_name,
+        enforce_eager=True,
         sparsity=sparsity,
         dtype=dtype,
         max_model_len=1024,
diff --git a/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py b/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py
index 77244fa399f47..291dd5e9faa52 100644
--- a/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py
+++ b/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py
@@ -1,4 +1,4 @@
-from typing import Optional, Type
+from typing import List, Optional, Type
 
 import torch
 import torch.nn.functional as F
@@ -31,13 +31,14 @@ def create_weights(
         self,
         layer: torch.nn.Module,
         input_size_per_partition: int,
-        output_size_per_partition: int,
+        output_partition_sizes: List[int],
         input_size: int,
         output_size: int,
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
         del input_size, output_size  # Unused.
+        output_size_per_partition = sum(output_partition_sizes)
 
         supports_linear = (self.storage_format_cls !=
                            SparseBEGemmStorageFormat)
diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py
deleted file mode 100644
index 016689cefe850..0000000000000
--- a/vllm/model_executor/model_loader.py
+++ /dev/null
@@ -1,147 +0,0 @@
-"""Utilities for selecting and loading models."""
-import contextlib
-from typing import Tuple, Type
-
-import torch
-from torch import nn
-
-from vllm.config import DeviceConfig, ModelConfig
-from vllm.model_executor.models import ModelRegistry
-from vllm.model_executor.models.llava import LlavaForConditionalGeneration
-from vllm.model_executor.tensorizer_loader import (
-    ParameterizedLoadFormat, is_vllm_serialized_tensorizer,
-    load_with_tensorizer)
-from vllm.model_executor.weight_utils import (get_quant_config,
-                                              get_sparse_config,
-                                              initialize_dummy_weights)
-
-_VISION_MODEL_CLASSES = [
-    LlavaForConditionalGeneration,
-]
-
-
-@contextlib.contextmanager
-def _set_default_torch_dtype(dtype: torch.dtype):
-    """Sets the default torch dtype to the given dtype."""
-    old_dtype = torch.get_default_dtype()
-    torch.set_default_dtype(dtype)
-    yield
-    torch.set_default_dtype(old_dtype)
-
-
-def _get_model_architecture(
-        model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
-    architectures = getattr(model_config.hf_config, "architectures", [])
-    # Special handling for quantized Mixtral.
-    # FIXME(woosuk): This is a temporary hack.
-    if (model_config.quantization is not None
-            and "MixtralForCausalLM" in architectures):
-        architectures = ["QuantMixtralForCausalLM"]
-
-    for arch in architectures:
-        model_cls = ModelRegistry.load_model_cls(arch)
-        if model_cls is not None:
-            return (model_cls, arch)
-    raise ValueError(
-        f"Model architectures {architectures} are not supported for now. "
-        f"Supported architectures: {ModelRegistry.get_supported_archs()}")
-
-
-def get_architecture_class_name(model_config: ModelConfig) -> str:
-    return _get_model_architecture(model_config)[1]
-
-
-def get_model(model_config: ModelConfig, device_config: DeviceConfig,
-              **kwargs) -> nn.Module:
-    lora_config = kwargs.get("lora_config", None)
-    vision_language_config = kwargs.get("vision_language_config", None)
-    tensorizer_config = kwargs.get("tensorizer_config", None)
-    model_class = _get_model_architecture(model_config)[0]
-
-    # Get the (maybe sparse or quantized) linear method.
-    linear_method = None
-    if model_config.quantization is not None:
-        quant_config = get_quant_config(model_config)
-        capability = torch.cuda.get_device_capability()
-        capability = capability[0] * 10 + capability[1]
-        if capability < quant_config.get_min_capability():
-            raise ValueError(
-                f"The quantization method {model_config.quantization} is not "
-                "supported for the current GPU. "
-                f"Minimum capability: {quant_config.get_min_capability()}. "
-                f"Current capability: {capability}.")
-        supported_dtypes = quant_config.get_supported_act_dtypes()
-        if model_config.dtype not in supported_dtypes:
-            raise ValueError(
-                f"{model_config.dtype} is not supported for quantization "
-                f"method {model_config.quantization}. Supported dtypes: "
-                f"{supported_dtypes}")
-
-        linear_method = quant_config.get_linear_method()
-    # UPSTREAM SYNC: needed to support sparsity
-    if model_config.sparsity is not None:
-        sparse_config = get_sparse_config(model_config)
-        capability = torch.cuda.get_device_capability()
-        capability = capability[0] * 10 + capability[1]
-        if capability < sparse_config.get_min_capability():
-            raise ValueError(
-                f"The sparsity method {model_config.sparsity} is not "
-                "supported for the current GPU. "
-                f"Minimum capability: {sparse_config.get_min_capability()}. "
-                f"Current capability: {capability}.")
-        supported_dtypes = sparse_config.get_supported_act_dtypes()
-        if model_config.dtype not in supported_dtypes:
-            raise ValueError(
-                f"{model_config.dtype} is not supported for sparsity "
-                f"method {model_config.sparsity}. Supported dtypes: "
-                f"{supported_dtypes}")
-        linear_method = sparse_config.get_linear_method()
-
-    with _set_default_torch_dtype(model_config.dtype):
-        # Create a model instance.
-        # The weights will be initialized as empty tensors.
-        extra_kwargs = {}
-        if hasattr(model_class, "supported_lora_modules"):
-            extra_kwargs["lora_config"] = lora_config
-        elif lora_config:
-            raise ValueError(
-                f"Model {model_class.__name__} does not support LoRA, "
-                "but LoRA is enabled. Support for this model may "
-                "be added in the future. If this is important to you, "
-                "please open an issue on github.")
-        elif model_class in _VISION_MODEL_CLASSES:
-            extra_kwargs["vision_language_config"] = vision_language_config
-
-        with torch.device(device_config.device):
-            if (model_config.load_format == "tensorizer"
-                    and is_vllm_serialized_tensorizer(tensorizer_config)):
-                extra_kwargs["linear_method"] = linear_method
-                tensorizer_config.model_class = model_class
-                tensorizer_config.hf_config = model_config.hf_config
-                tensorizer_config.dtype = model_config.dtype
-                model = load_with_tensorizer(tensorizer_config, **extra_kwargs)
-                return model.eval()
-            model = model_class(config=model_config.hf_config,
-                                linear_method=linear_method,
-                                **extra_kwargs)
-        if model_config.load_format == "dummy":
-            # NOTE(woosuk): For accurate performance evaluation, we assign
-            # random values to the weights.
-            initialize_dummy_weights(model)
-        else:
-            # Load the weights from the cached or downloaded files.
-            if model_config.load_format == "tensorizer":
-                # Provide a dynamic load format for `model.load_weights`
-                # to retain tensorizer args from CLI.
-                model_config.load_format = ParameterizedLoadFormat(
-                    model_config.load_format)
-                model_config.load_format.params = (
-                    tensorizer_config._construct_tensorizer_args())
-
-            model.load_weights(
-                model_config.model,
-                model_config.download_dir,
-                model_config.load_format,
-                model_config.revision,
-            )
-    return model.eval()
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index f75c35a69d4a9..a392f4675b436 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -18,10 +18,11 @@
     tensorizer_weights_iterator)
 from vllm.model_executor.model_loader.utils import (get_model_architecture,
                                                     set_default_torch_dtype)
-from vllm.model_executor.model_loader.weight_utils import (
+from vllm.model_executor.model_loader.weight_utils import (  # UPSTREAM SYNC: needed for sparsity
     download_weights_from_hf, filter_files_not_needed_for_inference,
-    get_quant_config, initialize_dummy_weights, np_cache_weights_iterator,
-    pt_weights_iterator, safetensors_weights_iterator)
+    get_quant_config, get_sparse_config, initialize_dummy_weights,
+    np_cache_weights_iterator, pt_weights_iterator,
+    safetensors_weights_iterator)
 from vllm.model_executor.models.llava import LlavaForConditionalGeneration
 
 if TYPE_CHECKING:
@@ -57,6 +58,24 @@ def _get_linear_method(
                 f"{supported_dtypes}")
 
         linear_method = quant_config.get_linear_method()
+
+    if model_config.sparsity is not None:
+        sparse_config = get_sparse_config(model_config)
+        capability = torch.cuda.get_device_capability()
+        capability = capability[0] * 10 + capability[1]
+        if capability < sparse_config.get_min_capability():
+            raise ValueError(
+                f"The sparsity method {model_config.sparsity} is not "
+                "supported for the current GPU. "
+                f"Minimum capability: {sparse_config.get_min_capability()}. "
+                f"Current capability: {capability}.")
+        supported_dtypes = sparse_config.get_supported_act_dtypes()
+        if model_config.dtype not in supported_dtypes:
+            raise ValueError(
+                f"{model_config.dtype} is not supported for sparsity "
+                f"method {model_config.sparsity}. Supported dtypes: "
+                f"{supported_dtypes}")
+        linear_method = sparse_config.get_linear_method()
     return linear_method
 
 
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 46f07205aceab..aec58033fa1d3 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -117,7 +117,7 @@ def convert_bin_to_safetensor_file(
 
 # UPSTREAM SYNC: needed for sparsity
 # TODO: (MLE) load compressed models from here
-def get_sparse_config(model_config: ModelConfig):
+def get_sparse_config(model_config: ModelConfig) -> QuantizationConfig:
     from vllm.model_executor.layers.sparsity import get_sparsity_config
     sparsity_cls = get_sparsity_config(model_config.sparsity)
     hf_sparsity_config = getattr(model_config.hf_config, "sparsity_config",

From b7fb44b0f105fbe41d379232f7942e55a93f5f94 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Tue, 30 Apr 2024 00:38:20 +0000
Subject: [PATCH 102/106] format

---
 vllm/model_executor/model_loader/loader.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index a392f4675b436..6829fddb219fd 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -18,8 +18,9 @@
     tensorizer_weights_iterator)
 from vllm.model_executor.model_loader.utils import (get_model_architecture,
                                                     set_default_torch_dtype)
-from vllm.model_executor.model_loader.weight_utils import (  # UPSTREAM SYNC: needed for sparsity
+from vllm.model_executor.model_loader.weight_utils import (
     download_weights_from_hf, filter_files_not_needed_for_inference,
+    # UPSTREAM SYNC: needed for sparsity
     get_quant_config, get_sparse_config, initialize_dummy_weights,
     np_cache_weights_iterator, pt_weights_iterator,
     safetensors_weights_iterator)

From 8177a4bcc0bfbf2d320b0c736913d7d71c97d195 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Tue, 30 Apr 2024 00:41:15 +0000
Subject: [PATCH 103/106] linter

---
 vllm/model_executor/model_loader/loader.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 6829fddb219fd..a392f4675b436 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -18,9 +18,8 @@
     tensorizer_weights_iterator)
 from vllm.model_executor.model_loader.utils import (get_model_architecture,
                                                     set_default_torch_dtype)
-from vllm.model_executor.model_loader.weight_utils import (
+from vllm.model_executor.model_loader.weight_utils import (  # UPSTREAM SYNC: needed for sparsity
     download_weights_from_hf, filter_files_not_needed_for_inference,
-    # UPSTREAM SYNC: needed for sparsity
     get_quant_config, get_sparse_config, initialize_dummy_weights,
     np_cache_weights_iterator, pt_weights_iterator,
     safetensors_weights_iterator)

From 96219f1ed0a8baf8c1225a9600eb24b488d06f34 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Tue, 30 Apr 2024 00:43:00 +0000
Subject: [PATCH 104/106] ruff ruff

---
 vllm/model_executor/model_loader/loader.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index a392f4675b436..ea0067b462dfd 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -18,7 +18,8 @@
     tensorizer_weights_iterator)
 from vllm.model_executor.model_loader.utils import (get_model_architecture,
                                                     set_default_torch_dtype)
-from vllm.model_executor.model_loader.weight_utils import (  # UPSTREAM SYNC: needed for sparsity
+# UPSTREAM SYNC: needed for sparsity
+from vllm.model_executor.model_loader.weight_utils import (
     download_weights_from_hf, filter_files_not_needed_for_inference,
     get_quant_config, get_sparse_config, initialize_dummy_weights,
     np_cache_weights_iterator, pt_weights_iterator,

From 16f1aa236325f9d4e9ba58c7480c8605f1335ac6 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Tue, 30 Apr 2024 00:54:49 +0000
Subject: [PATCH 105/106] updated tests to skip starcoder for now

---
 tests/models/test_models_logprobs.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py
index 9818a9db62f5a..e515dd88b1317 100644
--- a/tests/models/test_models_logprobs.py
+++ b/tests/models/test_models_logprobs.py
@@ -18,14 +18,20 @@
     "EleutherAI/gpt-j-6b",
     "EleutherAI/pythia-1b",
     "bigscience/bloom-1b1",
-    # "mosaicml/mpt-7b",                # vLLM upstream bug in mpt right now  # noqa
+    "mosaicml/mpt-7b",
     "microsoft/phi-2",
     "stabilityai/stablelm-3b-4e1t",
-    # "allenai/OLMo-1B",                # dependencies are not installed right now  # noqa
+    "allenai/OLMo-1B",
     "bigcode/starcoder2-3b",
     "Qwen/Qwen1.5-0.5B",
 ]
 
+SKIPPED_MODELS = [
+    "mosaicml/mpt-7b",
+    "allenai/OLMo-1B",
+    "bigcode/starcoder2-3b",
+]
+
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16", "half"])
@@ -40,6 +46,11 @@ def test_models(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
+    if model in SKIPPED_MODELS:
+        pytest.skip(
+            reason="Low priority models not currently passing. "
+                    "We need to re-enable these.")
+
     hf_model = hf_runner_nm(model, dtype=dtype)
     hf_outputs = hf_model.generate_greedy_logprobs_nm(example_prompts,
                                                       max_tokens, num_logprobs)

From 475ec0a9f8c68d9ae2310119717ee9bb00f3a5f8 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Tue, 30 Apr 2024 00:56:50 +0000
Subject: [PATCH 106/106] yapf

---
 tests/models/test_models_logprobs.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py
index e515dd88b1317..8081c22442f95 100644
--- a/tests/models/test_models_logprobs.py
+++ b/tests/models/test_models_logprobs.py
@@ -47,8 +47,7 @@ def test_models(
     num_logprobs: int,
 ) -> None:
     if model in SKIPPED_MODELS:
-        pytest.skip(
-            reason="Low priority models not currently passing. "
+        pytest.skip(reason="Low priority models not currently passing. "
                     "We need to re-enable these.")
 
     hf_model = hf_runner_nm(model, dtype=dtype)