From 0eb0757bef728c632be17e61a88cfa9dc1a760c4 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 23 Jul 2024 14:04:04 -0400 Subject: [PATCH 001/167] [Misc] Add ignored layers for `fp8` quantization (#6657) --- .../quantization/compressed_tensors/utils.py | 14 +++---- .../layers/quantization/fbgemm_fp8.py | 39 ++----------------- .../model_executor/layers/quantization/fp8.py | 13 ++++++- .../layers/quantization/utils/quant_utils.py | 38 ++++++++++++++++++ 4 files changed, 57 insertions(+), 47 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 7e8e70806a0fc..7912cbde5721f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -5,6 +5,9 @@ from pydantic import BaseModel, Field from torch.nn import Module +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + FUSED_LAYER_NAME_MAPPING) + class CompressionFormat(Enum): dense = "dense" @@ -86,13 +89,6 @@ def is_activation_quantization_format(format: str) -> bool: return format in _ACTIVATION_QUANTIZATION_FORMATS -# fused_name: List[shard_name] -_FUSED_LAYER_NAME_MAPPING = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"] -} - - def should_ignore_layer(layer_name: Optional[str], ignore: Iterable[str]) -> bool: if layer_name is None: @@ -106,8 +102,8 @@ def should_ignore_layer(layer_name: Optional[str], # in the safetensors checkpoint. So, we convert the name # from the fused version to unfused + check to make sure that # each shard of the fused layer has the same scheme. - if proj_name in _FUSED_LAYER_NAME_MAPPING: - shard_proj_names = _FUSED_LAYER_NAME_MAPPING[proj_name] + if proj_name in FUSED_LAYER_NAME_MAPPING: + shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] # Convert fused_name --> [shard_names] shard_names = [ diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 6b329231ec3af..5e8d1f1947421 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -11,6 +11,8 @@ QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + is_layer_skipped) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( apply_fp8_linear, create_per_channel_scale_param) from vllm.model_executor.utils import set_weight_attrs @@ -18,14 +20,6 @@ logger = init_logger(__name__) -# Note: this is a hack. We should update each model to register the -# stacked params and get it from there instead in a future PR. 
-# fused_name: List[shard_name] -_FUSED_LAYER_NAME_MAPPING = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"] -} - class FBGEMMFp8Config(QuantizationConfig): """Config class for FBGEMM Fp8.""" @@ -62,37 +56,10 @@ def from_config(cls, config: Dict[str, Any]) -> "FBGEMMFp8Config": input_scale_ub = cls.get_from_keys(config, ["activation_scale_ub"]) return cls(ignore_list=ignore_list, input_scale_ub=input_scale_ub) - def _is_layer_skipped(self, prefix: str) -> bool: - # prefix: model.layers.0.self_attn.q_proj - # proj_name: q_proj - proj_name = prefix.split(".")[-1] - if proj_name in _FUSED_LAYER_NAME_MAPPING: - shard_prefixes = [ - prefix.replace(proj_name, shard_proj_name) - for shard_proj_name in _FUSED_LAYER_NAME_MAPPING[proj_name] - ] - - is_skipped = None - for shard_prefix in shard_prefixes: - is_shard_skipped = shard_prefix in self.ignore_list - - if is_skipped is None: - is_skipped = is_shard_skipped - elif is_shard_skipped != is_skipped: - raise ValueError( - f"Detected some but not all shards of {prefix} " - "are quantized. All shards of fused layers " - "to have the same precision.") - else: - is_skipped = prefix in self.ignore_list - - assert is_skipped is not None - return is_skipped - def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: if isinstance(layer, LinearBase): - if self._is_layer_skipped(prefix): + if is_layer_skipped(prefix, self.ignore_list): return UnquantizedLinearMethod() return FBGEMMFp8LinearMethod(self) return None diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index b2a1b0a9534e8..3a4f2a49a3497 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -8,12 +8,15 @@ from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase, fused_moe) -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + is_layer_skipped) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( all_close_1d, apply_fp8_linear, create_per_tensor_scale_param, cutlass_fp8_supported, per_tensor_dequantize, requantize_with_max_scale) @@ -33,6 +36,7 @@ def __init__( self, is_checkpoint_fp8_serialized: bool = False, activation_scheme: str = "dynamic", + ignored_layers: Optional[List[str]] = None, ) -> None: self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized if is_checkpoint_fp8_serialized: @@ -42,6 +46,7 @@ def __init__( raise ValueError( f"Unsupported activation scheme {activation_scheme}") self.activation_scheme = activation_scheme + self.ignored_layers = ignored_layers or [] @classmethod def get_name(cls) -> str: @@ -64,14 +69,18 @@ def from_config(cls, config: Dict[str, Any]) -> "Fp8Config": quant_method = cls.get_from_keys(config, ["quant_method"]) is_checkpoint_fp8_serialized = ("fp8" in quant_method) activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + ignored_layers = 
cls.get_from_keys_or(config, ["ignored_layers"], None) return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, - activation_scheme=activation_scheme) + activation_scheme=activation_scheme, + ignored_layers=ignored_layers) def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import if isinstance(layer, LinearBase): + if is_layer_skipped(prefix, self.ignored_layers): + return UnquantizedLinearMethod() return Fp8LinearMethod(self) elif isinstance(layer, FusedMoE): return Fp8MoEMethod(self) diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 7abe919f859ca..2ba6a9a810ec0 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -1,10 +1,48 @@ """This file is used for /tests and /benchmarks""" +from typing import List + import numpy import torch SUPPORTED_NUM_BITS = [4, 8] SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128] +# Note: this is a hack. We should update each model to register the +# stacked params and get it from there instead in a future PR. +# fused_name: List[shard_name] +FUSED_LAYER_NAME_MAPPING = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] +} + + +def is_layer_skipped(prefix: str, ignored_layers: List[str]) -> bool: + # prefix: model.layers.0.self_attn.q_proj + # proj_name: q_proj + proj_name = prefix.split(".")[-1] + if proj_name in FUSED_LAYER_NAME_MAPPING: + shard_prefixes = [ + prefix.replace(proj_name, shard_proj_name) + for shard_proj_name in FUSED_LAYER_NAME_MAPPING[proj_name] + ] + + is_skipped = None + for shard_prefix in shard_prefixes: + is_shard_skipped = shard_prefix in ignored_layers + + if is_skipped is None: + is_skipped = is_shard_skipped + elif is_shard_skipped != is_skipped: + raise ValueError( + f"Detected some but not all shards of {prefix} " + "are quantized. All shards of fused layers " + "to have the same precision.") + else: + is_skipped = prefix in ignored_layers + + assert is_skipped is not None + return is_skipped + def get_pack_factor(num_bits): assert num_bits in SUPPORTED_NUM_BITS, f"Unsupported num_bits = {num_bits}" From 58f53034add8767c9e5d92431220faa409fa3dc2 Mon Sep 17 00:00:00 2001 From: Yehoshua Cohen <61619195+yecohn@users.noreply.github.com> Date: Tue, 23 Jul 2024 21:41:55 +0300 Subject: [PATCH 002/167] [Frontend] Add Usage data in each chunk for chat_serving. 
#6540 (#6652) --- tests/entrypoints/openai/test_chat.py | 40 ++++++++++++++++---- vllm/entrypoints/openai/serving_chat.py | 50 +++++++++++++++++++++++-- 2 files changed, 78 insertions(+), 12 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 1abaa01ae192a..c96d602b63438 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -295,14 +295,19 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, async for chunk in stream: assert chunk.usage is None - # Test stream=True, stream_options={"include_usage": True} - stream = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - stream_options={"include_usage": True}) + # Test stream=True, stream_options={"include_usage": True, + # "continuous_usage_stats": False}} + stream = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": + True, + "continuous_usage_stats": + False + }) async for chunk in stream: if chunk.choices[0].finish_reason is None: @@ -338,6 +343,25 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, stream=False, stream_options={"include_usage": True}) + # Test stream=True, stream_options={"include_usage": True, + # "continuous_usage_stats": True} + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": True, + "continuous_usage_stats": True + }, + ) + async for chunk in stream: + assert chunk.usage.prompt_tokens >= 0 + assert chunk.usage.completion_tokens >= 0 + assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + + chunk.usage.completion_tokens) + # NOTE: Not sure why, but when I place this after `test_guided_regex_chat` # (i.e. using the same ordering as in the Completions API tests), the test diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index b21c2bc513186..3899509ef3ff4 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -247,7 +247,15 @@ async def chat_completion_stream_generator( model=model_name) if (request.stream_options and request.stream_options.include_usage): - chunk.usage = None + if (request.stream_options.continuous_usage_stats): + prompt_tokens = len(res.prompt_token_ids) + usage = UsageInfo(prompt_tokens=prompt_tokens, + completion_tokens=0, + total_tokens=prompt_tokens) + chunk.usage = usage + else: + chunk.usage = None + data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" @@ -277,7 +285,18 @@ async def chat_completion_stream_generator( model=model_name) if (request.stream_options and request.stream_options.include_usage): - chunk.usage = None + if (request.stream_options. 
+ continuous_usage_stats): + prompt_tokens = len( + res.prompt_token_ids) + usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=0, + total_tokens=prompt_tokens) + chunk.usage = usage + else: + chunk.usage = None + data = chunk.model_dump_json( exclude_unset=True) yield f"data: {data}\n\n" @@ -336,7 +355,19 @@ async def chat_completion_stream_generator( model=model_name) if (request.stream_options and request.stream_options.include_usage): - chunk.usage = None + if (request.stream_options.continuous_usage_stats): + prompt_tokens = len(res.prompt_token_ids) + completion_tokens = len(output.token_ids) + usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + + completion_tokens, + ) + chunk.usage = usage + else: + chunk.usage = None + data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" else: @@ -356,7 +387,18 @@ async def chat_completion_stream_generator( model=model_name) if (request.stream_options and request.stream_options.include_usage): - chunk.usage = None + if (request.stream_options.continuous_usage_stats): + prompt_tokens = len(res.prompt_token_ids) + completion_tokens = len(output.token_ids) + usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + + completion_tokens, + ) + chunk.usage = usage + else: + chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" finish_reason_sent[i] = True From 507ef787d85dec24490069ffceacbd6b161f4f72 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Tue, 23 Jul 2024 13:22:09 -0600 Subject: [PATCH 003/167] [Model] Pipeline Parallel Support for DeepSeek v2 (#6519) Signed-off-by: Travis Johnson --- vllm/config.py | 1 + vllm/model_executor/models/deepseek_v2.py | 153 ++++++++++++++++------ 2 files changed, 115 insertions(+), 39 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index c27d26c098b59..6e0283f8379a2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -31,6 +31,7 @@ _PP_SUPPORTED_MODELS = [ "AquilaModel", "AquilaForCausalLM", + "DeepseekV2ForCausalLM", "InternLMForCausalLM", "LlamaForCausalLM", "LLaMAForCausalLM", diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 2d12ceb7f3dbf..2e3e9b6f2792e 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -29,7 +29,8 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig -from vllm.distributed import (get_tensor_model_parallel_world_size, +from vllm.distributed import (get_pp_group, + get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE @@ -49,6 +50,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput +from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers + class DeepseekV2MLP(nn.Module): @@ -59,17 +62,20 @@ def __init__( hidden_act: str, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True, + prefix: str = "", ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( hidden_size, [intermediate_size] * 2, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") self.down_proj = RowParallelLinear(intermediate_size, 
hidden_size, bias=False, quant_config=quant_config, - reduce_results=reduce_results) + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. " "Only silu is supported for now.") @@ -88,6 +94,7 @@ def __init__( self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.tp_size = get_tensor_model_parallel_world_size() @@ -112,12 +119,14 @@ def __init__( quant_config=quant_config, use_grouped_topk=True, num_expert_group=config.n_group, - topk_group=config.topk_group) + topk_group=config.topk_group, + prefix=f"{prefix}.experts") self.gate = ReplicatedLinear(config.hidden_size, config.n_routed_experts, bias=False, - quant_config=None) + quant_config=None, + prefix=f"{prefix}.gate") if config.n_shared_experts is not None: intermediate_size = (config.moe_intermediate_size * config.n_shared_experts) @@ -172,10 +181,9 @@ def __init__( max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - layer_idx=None, + prefix: str = "", ) -> None: super().__init__() - self.layer_idx = layer_idx self.hidden_size = hidden_size self.qk_nope_head_dim = qk_nope_head_dim self.qk_rope_head_dim = qk_rope_head_dim @@ -195,38 +203,44 @@ def __init__( self.q_a_proj = ReplicatedLinear(self.hidden_size, self.q_lora_rank, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.q_a_proj") self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps) self.q_b_proj = ColumnParallelLinear(q_lora_rank, self.num_heads * self.qk_head_dim, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.q_b_proj") else: self.q_proj = ColumnParallelLinear(self.hidden_size, self.num_heads * self.qk_head_dim, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.q_proj") - self.kv_a_proj_with_mqa = ReplicatedLinear(self.hidden_size, - self.kv_lora_rank + - self.qk_rope_head_dim, - bias=False, - quant_config=quant_config) + self.kv_a_proj_with_mqa = ReplicatedLinear( + self.hidden_size, + self.kv_lora_rank + self.qk_rope_head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.kv_a_proj_with_mqa") self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps) self.kv_b_proj = ColumnParallelLinear( self.kv_lora_rank, self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.kv_b_proj") # O projection. self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim, self.hidden_size, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.o_proj") rope_scaling['type'] = 'deepseek_yarn' self.rotary_emb = get_rope(qk_rope_head_dim, rotary_dim=qk_rope_head_dim, @@ -308,7 +322,7 @@ class DeepseekV2DecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, - layer_idx: int, + prefix: str, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: @@ -318,6 +332,9 @@ def __init__( rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + # DecoderLayers are created with `make_layers` which passes the prefix + # with the layer's index. 
+ layer_idx = int(prefix.split(sep='.')[-1]) self.self_attn = DeepseekV2Attention( config=config, hidden_size=self.hidden_size, @@ -333,18 +350,23 @@ def __init__( max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, - layer_idx=layer_idx, + prefix=f"{prefix}.self_attn", ) if (config.n_routed_experts is not None and layer_idx >= config.first_k_dense_replace and layer_idx % config.moe_layer_freq == 0): - self.mlp = DeepseekV2MoE(config=config, quant_config=quant_config) + self.mlp = DeepseekV2MoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) else: self.mlp = DeepseekV2MLP( hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, + prefix=f"{prefix}.mlp", ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -389,23 +411,34 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - DeepseekV2DecoderLayer(config, - layer_idx, - cache_config=cache_config, - quant_config=quant_config) - for layer_idx in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: DeepseekV2DecoderLayer( + config, + prefix, + cache_config=cache_config, + quant_config=quant_config, + ), + prefix=f"{prefix}.layers") + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() def forward( self, @@ -413,14 +446,28 @@ def forward( positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): + if get_pp_group().is_first_rank: + hidden_states = self.embed_tokens(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in range(self.start_layer, self.end_layer): layer = self.layers[i] hidden_states, residual = layer(positions, hidden_states, - kv_caches[i], attn_metadata, - residual) + kv_caches[i - self.start_layer], + attn_metadata, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + hidden_states, _ = self.norm(hidden_states, residual) return hidden_states @@ -436,7 +483,10 @@ def __init__( super().__init__() self.config = config self.quant_config = quant_config - self.model = DeepseekV2Model(config, cache_config, quant_config) + self.model = DeepseekV2Model(config, + cache_config, + quant_config, + prefix="model") self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) @@ -452,7 +502,7 @@ def forward( intermediate_tensors: 
Optional[IntermediateTensors] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata) + attn_metadata, intermediate_tensors) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, @@ -469,6 +519,20 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, + device: torch.device) -> IntermediateTensors: + return IntermediateTensors({ + "hidden_states": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + "residual": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + }) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) @@ -504,6 +568,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -514,6 +582,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if weight_name not in name: continue name = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, @@ -527,6 +599,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if name.endswith(".bias") and name not in params_dict: continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) From 1bedf210e35f5a11df5e9dd51e82b0663f854cf4 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Tue, 23 Jul 2024 13:47:48 -0700 Subject: [PATCH 004/167] Bump `transformers` version for Llama 3.1 hotfix and patch Chameleon (#6690) --- requirements-common.txt | 2 +- tests/test_config.py | 53 +++---- vllm/model_executor/models/__init__.py | 2 - vllm/model_executor/models/chameleon.py | 3 +- vllm/transformers_utils/config.py | 9 +- vllm/transformers_utils/configs/__init__.py | 4 - vllm/transformers_utils/configs/chameleon.py | 138 ------------------- 7 files changed, 33 insertions(+), 178 deletions(-) delete mode 100644 vllm/transformers_utils/configs/chameleon.py diff --git a/requirements-common.txt b/requirements-common.txt index 29643cfce161b..940740722c7e6 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -6,7 +6,7 @@ numpy < 2.0.0 requests tqdm py-cpuinfo -transformers >= 4.42.4 # Required for Gemma 2 and for additional chat template parameters. +transformers >= 4.43.1 # Required for Chameleon and Llama 3.1 hotfox. tokenizers >= 0.19.1 # Required for Llama 3. 
fastapi aiohttp diff --git a/tests/test_config.py b/tests/test_config.py index 6c8af9d7966b4..9f7d85e39ad67 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -64,9 +64,8 @@ def test_get_sliding_window(): def test_rope_customization(): - TEST_ROPE_SCALING = {"type": "dynamic", "factor": 2.0} + TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0} TEST_ROPE_THETA = 16_000_000.0 - LONGCHAT_ROPE_SCALING = {"type": "linear", "factor": 8.0} llama_model_config = ModelConfig( "meta-llama/Meta-Llama-3-8B-Instruct", @@ -96,27 +95,29 @@ def test_rope_customization(): None) == TEST_ROPE_THETA assert llama_model_config.max_model_len == 16384 - longchat_model_config = ModelConfig( - "lmsys/longchat-13b-16k", - "lmsys/longchat-13b-16k", - tokenizer_mode="auto", - trust_remote_code=False, - dtype="float16", - seed=0, - ) - assert getattr(longchat_model_config.hf_config, "rope_scaling", - None) == LONGCHAT_ROPE_SCALING - assert longchat_model_config.max_model_len == 16384 - - longchat_model_config = ModelConfig( - "lmsys/longchat-13b-16k", - "lmsys/longchat-13b-16k", - tokenizer_mode="auto", - trust_remote_code=False, - dtype="float16", - seed=0, - rope_scaling=TEST_ROPE_SCALING, - ) - assert getattr(longchat_model_config.hf_config, "rope_scaling", - None) == TEST_ROPE_SCALING - assert longchat_model_config.max_model_len == 4096 + # TODO: add these back when the rope configs are fixed + # LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0} + # longchat_model_config = ModelConfig( + # "lmsys/longchat-13b-16k", + # "lmsys/longchat-13b-16k", + # tokenizer_mode="auto", + # trust_remote_code=False, + # dtype="float16", + # seed=0, + # ) + # assert getattr(longchat_model_config.hf_config, "rope_scaling", + # None) == LONGCHAT_ROPE_SCALING + # assert longchat_model_config.max_model_len == 16384 + + # longchat_model_config = ModelConfig( + # "lmsys/longchat-13b-16k", + # "lmsys/longchat-13b-16k", + # tokenizer_mode="auto", + # trust_remote_code=False, + # dtype="float16", + # seed=0, + # rope_scaling=TEST_ROPE_SCALING, + # ) + # assert getattr(longchat_model_config.hf_config, "rope_scaling", + # None) == TEST_ROPE_SCALING + # assert longchat_model_config.max_model_len == 4096 diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 31370aebba599..55a039a88d535 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -16,8 +16,6 @@ "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), - #TODO(ywang96): remove this when huggingface fixes the model repo - "ChameleonForCausalLM": ("chameleon", "ChameleonForConditionalGeneration"), "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index d06eb0504079f..6ece95495a026 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -6,6 +6,7 @@ import torch.nn.functional as F from PIL import Image from torch import nn +from transformers import ChameleonConfig, ChameleonVQVAEConfig from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig @@ -30,8 +31,6 @@ from vllm.multimodal.image import (cached_get_tokenizer, repeat_and_pad_image_tokens) from 
vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData -from vllm.transformers_utils.configs import (ChameleonConfig, - ChameleonVQVAEConfig) from vllm.utils import print_warning_once from .interfaces import SupportsVision diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index f99bea356da88..652505a892142 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -5,10 +5,10 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger -from vllm.transformers_utils.configs import (ChameleonConfig, ChatGLMConfig, - DbrxConfig, JAISConfig, - MedusaConfig, MLPSpeculatorConfig, - MPTConfig, RWConfig) +from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, + JAISConfig, MedusaConfig, + MLPSpeculatorConfig, MPTConfig, + RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -18,7 +18,6 @@ logger = init_logger(__name__) _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { - "chameleon": ChameleonConfig, "chatglm": ChatGLMConfig, "dbrx": DbrxConfig, "mpt": MPTConfig, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 080c0777ebdcc..51de11ca3e42a 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,5 +1,3 @@ -from vllm.transformers_utils.configs.chameleon import (ChameleonConfig, - ChameleonVQVAEConfig) from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.dbrx import DbrxConfig # RWConfig is for the original tiiuae/falcon-40b(-instruct) and @@ -12,8 +10,6 @@ from vllm.transformers_utils.configs.mpt import MPTConfig __all__ = [ - "ChameleonConfig", - "ChameleonVQVAEConfig", "ChatGLMConfig", "DbrxConfig", "MPTConfig", diff --git a/vllm/transformers_utils/configs/chameleon.py b/vllm/transformers_utils/configs/chameleon.py deleted file mode 100644 index c1ac1182e14c4..0000000000000 --- a/vllm/transformers_utils/configs/chameleon.py +++ /dev/null @@ -1,138 +0,0 @@ -from typing import List, Optional - -from transformers import PretrainedConfig - - -#TODO (ywang96): Remove this file and import it from -# transformers once the new release with Chameleon support -# is available. 
-class ChameleonConfig(PretrainedConfig): - model_type = "chameleon" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=65536, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-05, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - model_parallel_size=1, - swin_norm=False, - vq_config=None, - vocabulary_map=None, - mlp_bias=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.mlp_bias = mlp_bias - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self._rope_scaling_validation() - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.model_parallel_size = model_parallel_size - self.swin_norm = swin_norm - - if vq_config is None: - vq_config = {} - - self.vq_config = ChameleonVQVAEConfig(**vq_config) - - self.vocabulary_map = vocabulary_map - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. 
- """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, - dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, " - f"`type` and `factor`, got {self.rope_scaling}") - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in [ - "linear", "dynamic" - ]: - raise ValueError( - "`rope_scaling`'s type field must be one of ['linear', " - f"'dynamic'], got {rope_scaling_type}") - if rope_scaling_factor is None or not isinstance( - rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError( - "`rope_scaling`'s factor field must be a float > 1, " - f"got {rope_scaling_factor}") - - -class ChameleonVQVAEConfig(PretrainedConfig): - - model_type = "chameleon_vqgan" - - def __init__( - self, - embed_dim: int = 256, - num_embeddings: int = 8192, - double_latent: bool = False, - latent_channels: int = 256, - resolution: int = 512, - in_channels: int = 3, - base_channels: int = 128, - channel_multiplier: List[int] = [1, 1, 2, 2, 4], #noqa - num_res_blocks: int = 2, - attn_resolutions: Optional[List[int]] = None, - dropout: float = 0.0, - attn_type: str = "vanilla", - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - self.embed_dim = embed_dim - self.num_embeddings = num_embeddings - self.double_latent = double_latent - self.latent_channels = latent_channels - self.resolution = resolution - self.in_channels = in_channels - self.base_channels = base_channels - self.channel_multiplier = channel_multiplier - self.num_res_blocks = num_res_blocks - self.attn_resolutions = attn_resolutions - self.dropout = dropout - self.attn_type = attn_type - self.initializer_range = initializer_range From 72fc7048032a9a12f42c76e60c0d8ca3673f0692 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 23 Jul 2024 14:03:49 -0700 Subject: [PATCH 005/167] [build] relax wheel size limit (#6704) --- .buildkite/check-wheel-size.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index 75ad094fa1382..b39dce2659a54 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -1,7 +1,7 @@ import os import zipfile -MAX_SIZE_MB = 200 +MAX_SIZE_MB = 250 def print_top_10_largest_files(zip_file): From 01c16ede6b4fc2e07b6eb5a4f60a64a1f365e460 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 23 Jul 2024 18:45:12 -0400 Subject: [PATCH 006/167] [CI] Add smoke test for non-uniform AutoFP8 quantization (#6702) --- tests/quantization/test_fp8.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 82dc775f8d812..0602fedf0b8e3 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -13,6 +13,7 @@ MODELS = [ "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", "nm-testing/Phi-3-mini-128k-instruct-FP8", + "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV", ] From 2f808e69ab16a8775cd58849b15b465d4f11b92e Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 24 Jul 2024 01:05:05 +0200 Subject: [PATCH 007/167] [Bugfix] StatLoggers: cache spec decode metrics when they get collected. 
(#6645) Signed-off-by: Thomas Parnell --- tests/metrics/test_metrics.py | 91 +++++++++++++++++++++++++++++++++++ vllm/engine/metrics.py | 47 ++++++++++++------ 2 files changed, 122 insertions(+), 16 deletions(-) diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 42b15cd6c458e..23a7a85580a0a 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -1,3 +1,4 @@ +import time from typing import List import pytest @@ -10,6 +11,8 @@ from vllm.engine.metrics import RayPrometheusStatLogger from vllm.sampling_params import SamplingParams +from ..conftest import cleanup + MODELS = [ "facebook/opt-125m", ] @@ -219,6 +222,94 @@ def test_metric_spec_decode( "does not meet expectation") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [10]) +@pytest.mark.parametrize("log_interval", [1, 3, 5, 7]) +def test_metric_spec_decode_interval( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, + log_interval: int, +) -> None: + k = 5 + + engine_args = EngineArgs(model=model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.4, + speculative_model=model, + num_speculative_tokens=k, + use_v2_block_manager=True, + enforce_eager=True) + + engine = LLMEngine.from_engine_args(engine_args) + + try: + + engine.add_request( + "request-id-0", + example_prompts[0], + SamplingParams(max_tokens=max_tokens), + ) + + # set log internal + stat_logger = engine.stat_loggers['prometheus'] + stat_logger.local_interval = log_interval + + # prefill + engine.step() + + # wait for 5 seconds to ensure that spec decode metrics + # get triggered in first decode step + time.sleep(5) + + # first decode step should trigger async collection of metrics + engine.step() + + # wait one second to allow H2D transfer to finish + time.sleep(1) + + # second decode step should now be able to collect the spec + # decode stats and the request should also be finished + engine.step() + + # must have finisehd now + assert not engine.has_unfinished_requests() + + # wait to ensure logging occurs + time.sleep(log_interval) + + # force logging + engine.step() + + # Note that the purpose of this test is to verify spec decode + # metrics instead of functional correctness, so the expected values + # are intended to be loose. 
+ metric_name_to_expected_fn = { + "gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1, + "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1, + "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k, + "counter_spec_decode_num_draft_tokens": lambda v: v == k, + "counter_spec_decode_num_emitted_tokens": + lambda v: 0 <= v <= k + 1, + } + + for metric_name, is_expected in metric_name_to_expected_fn.items(): + metric_val = getattr( + stat_logger.metrics, + metric_name).labels(**stat_logger.labels)._value.get() + assert is_expected(metric_val), ( + f"the value of metric {metric_name} ({metric_val}) " + "does not meet expectation") + + finally: + del engine + cleanup() + + def assert_metrics(engine: LLMEngine, disable_log_stats: bool, num_requests: int) -> None: if disable_log_stats: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 4ed7da2377111..2f105b9cd2fb6 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -355,6 +355,7 @@ def __init__(self, local_interval: float) -> None: self.num_generation_tokens: List[int] = [] self.last_local_log = time.time() self.local_interval = local_interval + self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None @abstractmethod def info(self, type: str, obj: SupportsMetricsInfo) -> None: @@ -364,6 +365,12 @@ def info(self, type: str, obj: SupportsMetricsInfo) -> None: def log(self, stats: Stats) -> None: raise NotImplementedError + def maybe_update_spec_decode_metrics(self, stats: Stats): + """Save spec decode metrics (since they are unlikely + to be emitted at same time as log interval).""" + if stats.spec_decode_metrics is not None: + self.spec_decode_metrics = stats.spec_decode_metrics + class LoggingStatLogger(StatLoggerBase): """LoggingStatLogger is used in LLMEngine to log to Stdout.""" @@ -379,6 +386,9 @@ def log(self, stats: Stats) -> None: self.num_prompt_tokens.append(stats.num_prompt_tokens_iter) self.num_generation_tokens.append(stats.num_generation_tokens_iter) + # Update spec decode metrics + self.maybe_update_spec_decode_metrics(stats) + # Log locally every local_interval seconds. if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): @@ -408,15 +418,16 @@ def log(self, stats: Stats) -> None: stats.cpu_cache_usage_sys * 100, ) + if self.spec_decode_metrics is not None: + logger.info( + self._format_spec_decode_metrics_str( + self.spec_decode_metrics)) + # Reset tracked stats for next interval. self.num_prompt_tokens = [] self.num_generation_tokens = [] self.last_local_log = stats.now - - if stats.spec_decode_metrics is not None: - logger.info( - self._format_spec_decode_metrics_str( - stats.spec_decode_metrics)) + self.spec_decode_metrics = None def _format_spec_decode_metrics_str( self, metrics: "SpecDecodeWorkerMetrics") -> str: @@ -533,6 +544,9 @@ def log(self, stats: Stats): self.num_prompt_tokens.append(stats.num_prompt_tokens_iter) self.num_generation_tokens.append(stats.num_generation_tokens_iter) + # Update spec decode metrics + self.maybe_update_spec_decode_metrics(stats) + # Log locally every local_interval seconds. if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): @@ -550,26 +564,27 @@ def log(self, stats: Stats): prompt_throughput=prompt_throughput, generation_throughput=generation_throughput) - # Reset tracked stats for next interval. 
- self.num_prompt_tokens = [] - self.num_generation_tokens = [] - self.last_local_log = stats.now - - if stats.spec_decode_metrics is not None: + if self.spec_decode_metrics is not None: self._log_gauge( self.metrics.gauge_spec_decode_draft_acceptance_rate, - stats.spec_decode_metrics.draft_acceptance_rate) + self.spec_decode_metrics.draft_acceptance_rate) self._log_gauge(self.metrics.gauge_spec_decode_efficiency, - stats.spec_decode_metrics.system_efficiency) + self.spec_decode_metrics.system_efficiency) self._log_counter( self.metrics.counter_spec_decode_num_accepted_tokens, - stats.spec_decode_metrics.accepted_tokens) + self.spec_decode_metrics.accepted_tokens) self._log_counter( self.metrics.counter_spec_decode_num_draft_tokens, - stats.spec_decode_metrics.draft_tokens) + self.spec_decode_metrics.draft_tokens) self._log_counter( self.metrics.counter_spec_decode_num_emitted_tokens, - stats.spec_decode_metrics.emitted_tokens) + self.spec_decode_metrics.emitted_tokens) + + # Reset tracked stats for next interval. + self.num_prompt_tokens = [] + self.num_generation_tokens = [] + self.last_local_log = stats.now + self.spec_decode_metrics = None class RayPrometheusStatLogger(PrometheusStatLogger): From 87525fab925edf549611a1a74a40699b0b5e316e Mon Sep 17 00:00:00 2001 From: dongmao zhang Date: Tue, 23 Jul 2024 16:45:09 -0700 Subject: [PATCH 008/167] [bitsandbytes]: support read bnb pre-quantized model (#5753) Co-authored-by: Michael Goin --- docs/source/index.rst | 1 + docs/source/quantization/bnb.rst | 43 +++++++++ tests/quantization/test_bitsandbytes.py | 18 +++- vllm/config.py | 2 + vllm/engine/arg_utils.py | 4 +- .../layers/quantization/bitsandbytes.py | 25 +----- vllm/model_executor/model_loader/loader.py | 88 ++++++++++++++++--- .../model_loader/weight_utils.py | 1 + 8 files changed, 143 insertions(+), 39 deletions(-) create mode 100644 docs/source/quantization/bnb.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 2691805ed97a4..ded9a424ee68c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -105,6 +105,7 @@ Documentation quantization/supported_hardware quantization/auto_awq + quantization/bnb quantization/fp8 quantization/fp8_e5m2_kvcache quantization/fp8_e4m3_kvcache diff --git a/docs/source/quantization/bnb.rst b/docs/source/quantization/bnb.rst new file mode 100644 index 0000000000000..aefb54a8acb65 --- /dev/null +++ b/docs/source/quantization/bnb.rst @@ -0,0 +1,43 @@ +.. _bits_and_bytes: + +BitsAndBytes +================== + +vLLM now supports `BitsAndBytes `_ for more efficient model inference. +BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy. +Compared to other quantization methods, BitsAndBytes eliminates the need for calibrating the quantized model with input data. + +Below are the steps to utilize BitsAndBytes with vLLM. + +.. code-block:: console + + $ pip install bitsandbytes>=0.42.0 + +vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. + +You can find bitsandbytes quantized models on https://huggingface.co/models?other=bitsandbytes. +And usually, these repositories have a config.json file that includes a quantization_config section. + +Read quantized checkpoint. +-------------------------- + +.. code-block:: python + + from vllm import LLM + import torch + # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. 
+ model_id = "unsloth/tinyllama-bnb-4bit" + llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ + quantization="bitsandbytes", load_format="bitsandbytes") + +Inflight quantization: load as 4bit quantization +------------------------------------------------ + +.. code-block:: python + + from vllm import LLM + import torch + model_id = "huggyllama/llama-7b" + llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ + quantization="bitsandbytes", load_format="bitsandbytes") + diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index 953fd9ba939c8..b760e9ccb6b74 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -8,15 +8,20 @@ from tests.quantization.utils import is_quant_method_supported from vllm import SamplingParams +models_to_test = [ + ('huggyllama/llama-7b', 'quantize model inflight'), + ('lllyasviel/omost-llama-3-8b-4bits', 'read pre-quantized model'), +] + @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), reason='bitsandbytes is not supported on this GPU type.') -def test_load_bnb_model(vllm_runner) -> None: - with vllm_runner('huggyllama/llama-7b', +@pytest.mark.parametrize("model_name, description", models_to_test) +def test_load_bnb_model(vllm_runner, model_name, description) -> None: + with vllm_runner(model_name, quantization='bitsandbytes', load_format='bitsandbytes', enforce_eager=True) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 # check the weights in MLP & SelfAttention are quantized to torch.uint8 @@ -65,12 +70,17 @@ def test_load_bnb_model(vllm_runner) -> None: 'To be or not to be, that is the question.' ] outputs = llm.generate(prompts, sampling_params=sampling_params) - assert len(outputs) == len(prompts) for index in range(len(outputs)): # compare the first line of the output actual_output = outputs[index][1][0].split('\n', 1)[0] expected_output = expected_outputs[index].split('\n', 1)[0] + + assert len(actual_output) >= len(expected_output), ( + f'Actual {actual_output} should be larger than or equal to ' + f'expected {expected_output}') + actual_output = actual_output[:len(expected_output)] + assert actual_output == expected_output, ( f'Expected: {expected_output}, but got: {actual_output}') diff --git a/vllm/config.py b/vllm/config.py index 6e0283f8379a2..6403a53f86281 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -591,9 +591,11 @@ class LoadConfig: mainly for profiling. "tensorizer" will use CoreWeave's tensorizer library for fast weight loading. + "bitsandbytes" will load nf4 type weights. ignore_patterns: The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's checkpoints. 
+ """ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 05bfe7c24f978..cd64d3345b830 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -676,8 +676,8 @@ def create_engine_config(self, ) -> EngineConfig: # bitsandbytes quantization needs a specific model loader # so we make sure the quant method and the load format are consistent if (self.quantization == "bitsandbytes" or - self.qlora_adapter_name_or_path is not None) and \ - self.load_format != "bitsandbytes": + self.qlora_adapter_name_or_path is not None) and \ + self.load_format != "bitsandbytes": raise ValueError( "BitsAndBytes quantization and QLoRA adapter only support " f"'bitsandbytes' load format, but got {self.load_format}") diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 4a68da5a2323e..c143d1a8f2bc7 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -15,19 +15,11 @@ class BitsAndBytesConfig(QuantizationConfig): Reference: https://arxiv.org/abs/2305.14314 """ - def __init__( - self, - adapter_name_or_path: str, - target_modules: List[str], - ) -> None: - - self.adapter_name_or_path = adapter_name_or_path - self.target_modules = target_modules + def __init__(self, ) -> None: + pass def __repr__(self) -> str: - return ( - f"BitsAndBytesConfig(adapter_name_or_path={self.adapter_name_or_path}" - ) + return "BitsAndBytesConfig" @classmethod def get_name(self) -> str: @@ -49,16 +41,7 @@ def get_config_filenames() -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "BitsAndBytesConfig": - adapter_name = cls.get_from_keys(config, ["adapter_name_or_path"]) - default_target_modules = [ - "gate_proj", "down_proj", "up_proj", "q_proj", "k_proj", "v_proj", - "o_proj" - ] - if adapter_name == "": - target_modules = default_target_modules - else: - target_modules = cls.get_from_keys(config, ["target_modules"]) - return cls(adapter_name, target_modules) + return cls() def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["BitsAndBytesLinearMethod"]: diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 88f16918b0119..fe501b9b22da1 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -702,8 +702,14 @@ def _prepare_weights(self, model_name_or_path: str, return hf_weights_files, matched_pattern == "*.safetensors" + def _hf_weight_iter(self, hf_weights_files, use_safetensors: bool): + if use_safetensors: + return safetensors_weights_iterator(hf_weights_files) + else: + return pt_weights_iterator(hf_weights_files) + def _get_quantized_weights_iterator( - self, model_name_or_path: str, revision: Optional[str] + self, model_name_or_path: str, revision: Optional[str], pre_quant: bool ) -> Tuple[Generator[Tuple[str, torch.Tensor], None, None], Dict[str, Any]]: """Get an iterator to the model weights with bitsandbytes quantization, @@ -712,6 +718,7 @@ def _get_quantized_weights_iterator( # only load the bitsandbytes module when needed try: import bitsandbytes + from bitsandbytes.functional import QuantState if bitsandbytes.__version__ < "0.42.0": raise ImportError("bitsandbytes version is wrong. 
Please " "install bitsandbytes>=0.42.0.") @@ -725,17 +732,63 @@ def _get_quantized_weights_iterator( model_name_or_path, revision) quant_state_dict = {} - if use_safetensors: - weight_iterator = safetensors_weights_iterator(hf_weights_files) - else: - weight_iterator = pt_weights_iterator(hf_weights_files) - def generator(): + def quantized_checkpoint() -> Generator: + # First iterate over all quant state weights + weight_iterator = self._hf_weight_iter(hf_weights_files, + use_safetensors) + temp_state_dict = {} for weight_name, weight_tensor in weight_iterator: + if weight_name.endswith(".weight"): + continue + # TODO: only nf4 quantization is supported for now + if weight_name.endswith(".quant_state.bitsandbytes__fp4"): + raise NotImplementedError( + "Only bitsandbytes_nf4 quantization" + f"is supported for now. {weight_name} is fp4 quantized" + ) + temp_state_dict[weight_name] = weight_tensor + + # Closure to parse quant_state for each prequant weight + def _parse_quant_state(param_name: str, + temp_state_dict: Dict) -> QuantState: + quant_state = {} + for k in temp_state_dict: + if param_name + "." in k: + quant_state[k] = temp_state_dict[k] + # bitsandbytes library requires + # weight.quant_state.bitsandbytes__nf4 in CPU + quant_state[param_name + + ".quant_state.bitsandbytes__nf4"] = quant_state[ + param_name + + ".quant_state.bitsandbytes__nf4"].cpu().data + return QuantState.from_dict(quant_state, device="cuda") + + # Second iterate over all prequant and normal weights + # pre quantized weights would have a quant_state + for weight_name, weight_tensor in self._hf_weight_iter( + hf_weights_files, use_safetensors): + # Filter out all weights whose suffix is not ".weight" + if not weight_name.endswith(".weight"): + continue + if weight_name + ".quant_state.bitsandbytes__nf4" \ + in temp_state_dict: + quant_state = _parse_quant_state(weight_name, + temp_state_dict) + weight_name = weight_name.replace(".weight", ".qweight") + quant_state_dict[weight_name] = quant_state + yield weight_name.replace(".weight", + ".qweight"), weight_tensor + else: + yield weight_name, weight_tensor + + def generator() -> Generator: + for weight_name, weight_tensor in self._hf_weight_iter( + hf_weights_files, use_safetensors): if any(target_module in weight_name for target_module in self.target_modules): weight_name = weight_name.replace(".weight", ".qweight") - # bitsandbytes requires data in GPU + # bitsandbytes requires data in GPU loaded_weight = weight_tensor.cuda().data with set_default_torch_dtype(torch.float32): processed_weight, quant_state = quantize_4bit( @@ -749,6 +802,8 @@ def generator(): yield weight_name, processed_weight + if pre_quant: + return quantized_checkpoint(), quant_state_dict return generator(), quant_state_dict def _load_weights(self, model_config: ModelConfig, @@ -766,12 +821,21 @@ def _load_weights(self, model_config: ModelConfig, logger.info("Loading weights with BitsAndBytes quantization. 
" " May take a while ...") - qweight_iterator, quant_state_dict = ( - self._get_quantized_weights_iterator(model_config.model, - model_config.revision)) + is_quantized_checkpoint = False + quant_config = getattr(model_config.hf_config, "quantization_config", + None) + if quant_config is not None and quant_config.get( + 'quant_method') == "bitsandbytes": + is_quantized_checkpoint = True + + qweight_iterator, quant_state_dict = \ + self._get_quantized_weights_iterator( + model_config.model, model_config.revision, is_quantized_checkpoint) model.load_weights(qweight_iterator) + torch.cuda.empty_cache() + param_dict = dict(model.named_parameters()) stacked_quant_state_dict: Dict[str, Dict[int, Any]] = {} for quant_param_name in quant_state_dict: @@ -809,9 +873,9 @@ def _load_weights(self, model_config: ModelConfig, f"pack_factor not set for parameter {param_name}.") num_elements = [0] * len(quant_states) - for seq, quant_state in enumerate(quant_states.items()): + for seq, quant_state in quant_states.items(): num_elements[seq] = math.prod( - quant_state[1].shape) // pack_ratio + quant_state.shape) // pack_ratio offsets = np.concatenate(([0], np.cumsum(num_elements))) set_weight_attrs(param, {"bnb_shard_offsets": offsets}) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index dbba6ea358346..942215da01af4 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -118,6 +118,7 @@ def convert_bin_to_safetensor_file( # TODO(woosuk): Move this to other place. def get_quant_config(model_config: ModelConfig, load_config: LoadConfig) -> QuantizationConfig: + quant_cls = get_quantization_config(model_config.quantization) # Read the quantization config from the HF model config, if available. 
hf_quant_config = getattr(model_config.hf_config, "quantization_config", From 5e8ca973ebd5584582923b8ed1d3d823769a80a5 Mon Sep 17 00:00:00 2001 From: William Lin Date: Tue, 23 Jul 2024 18:49:44 -0700 Subject: [PATCH 009/167] [Bugfix] fix flashinfer cudagraph capture for PP (#6708) --- tests/distributed/test_pipeline_parallel.py | 24 +++++++++++++++++++++ vllm/worker/model_runner.py | 14 ++++++------ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 7f555ed9168a4..d666b8a1d44bd 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -61,3 +61,27 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, tp_args.append("--enforce-eager") compare_two_settings(MODEL_NAME, pp_args, tp_args) + + +@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [ + (2, "JackFram/llama-160m"), +]) +@pytest.mark.parametrize("ATTN_BACKEND", [ + "FLASH_ATTN", + "FLASHINFER", +]) +def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): + cudagraph_args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--pipeline-parallel-size", + str(PP_SIZE), + "--distributed-executor-backend", + "ray", + ] + os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND + + eager_args = cudagraph_args + ["--enforce-eager"] + + compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index e63be184af16a..073c5a73f739b 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1040,9 +1040,9 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: self.parallel_config.pipeline_parallel_size): for batch_size in reversed(batch_size_capture_list): if self.attn_backend.get_name() == "flashinfer": - indptr_buffer = indptr_buffer[:batch_size + 1] - last_page_len_buffer = last_page_len_buffer[: - batch_size] + _indptr_buffer = indptr_buffer[:batch_size + 1] + _last_page_len_buffer = last_page_len_buffer[: + batch_size] num_qo_heads = ( self.model_config.get_num_attention_heads( @@ -1055,8 +1055,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: use_tensor_cores = False decode_wrapper = \ CUDAGraphBatchDecodeWithPagedKVCacheWrapper( - decode_workspace_buffer, indptr_buffer, - indices_buffer, last_page_len_buffer, "NHD", + decode_workspace_buffer, _indptr_buffer, + indices_buffer, _last_page_len_buffer, "NHD", use_tensor_cores) kv_cache_dtype = get_kv_cache_torch_dtype( self.kv_cache_dtype, self.model_config.dtype) @@ -1131,10 +1131,10 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: self.model, self.attn_backend.get_name()) if self.attn_backend.get_name() == "flashinfer": - graph_runner.flashinfer_indptr_buffer = indptr_buffer + graph_runner.flashinfer_indptr_buffer = _indptr_buffer graph_runner.flashinfer_indices_buffer = indices_buffer graph_runner.flashinfer_last_page_len_buffer = \ - last_page_len_buffer + _last_page_len_buffer graph_runner.flashinfer_decode_workspace_buffer = \ decode_workspace_buffer graph_runner.flashinfer_decode_wrapper = \ From c882a7f5b3ce5c98efb52c911ea15ca565d10cd7 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 24 Jul 2024 00:34:22 -0700 Subject: [PATCH 010/167] [SpecDecoding] Update MLPSpeculator CI tests to use smaller model (#6714) --- tests/spec_decode/e2e/test_mlp_correctness.py | 6 +++--- 1 file changed, 3 insertions(+), 
3 deletions(-) diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index dd67a7735a647..e310941afacf3 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -24,14 +24,14 @@ from .conftest import run_greedy_equality_correctness_test # main model -MAIN_MODEL = "ibm-granite/granite-3b-code-instruct" +MAIN_MODEL = "JackFram/llama-160m" # speculative model -SPEC_MODEL = "ibm-granite/granite-3b-code-instruct-accelerator" +SPEC_MODEL = "ibm-fms/llama-160m-accelerator" # max. number of speculative tokens: this corresponds to # n_predict in the config.json of the speculator model. -MAX_SPEC_TOKENS = 5 +MAX_SPEC_TOKENS = 3 # precision PRECISION = "float32" From 0a740a11ba055d6a2ae4786db6510684bfb7a887 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Wed, 24 Jul 2024 01:05:09 -0700 Subject: [PATCH 011/167] [Bugfix] Fix token padding for chameleon (#6724) --- vllm/model_executor/models/chameleon.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 6ece95495a026..7659f598bab94 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -125,7 +125,8 @@ def input_processor_for_chameleon(ctx: InputContext, llm_inputs: LLMInputs): # Appending sep token for chat mode to follow default processor # behavior - new_prompt += tokenizer.sep_token + if new_prompt is not None: + new_prompt += tokenizer.sep_token new_token_ids += [CHAMELEON_SEP_TOKEN_ID] # NOTE: Create a defensive copy of the original inputs From ccc4a73257b61d5f1249999f6a804d60c2d1a518 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 24 Jul 2024 01:07:23 -0700 Subject: [PATCH 012/167] [Docs][ROCm] Detailed instructions to build from source (#6680) --- .../getting_started/amd-installation.rst | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 61efad2013b2a..71d7527a3e706 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -107,6 +107,35 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases. $ python setup.py develop # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation +.. tip:: + + For example, vLLM v0.5.3 on ROCM 6.1 can be built with the following steps: + + .. code-block:: console + + $ pip install --upgrade pip + + $ # Install PyTorch + $ pip uninstall torch -y + $ pip install --no-cache-dir --pre torch==2.5.0.dev20240710 --index-url https://download.pytorch.org/whl/nightly/rocm6.1 + + $ # Build & install AMD SMI + $ pip install /opt/rocm/share/amd_smi + + $ # Install dependencies + $ pip install --upgrade numba scipy huggingface-hub[cli] + $ pip install "numpy<2" + $ pip install -r requirements-rocm.txt + + $ # Apply the patch to ROCM 6.1 (requires root permission) + $ wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib + $ rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* + + $ # Build vLLM for MI210/MI250/MI300. + $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + $ python3 setup.py develop + + .. tip:: - Triton flash attention is used by default. 
For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. From b5708117060460fcc6ba6f58e0669d3c46d6339e Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Wed, 24 Jul 2024 07:01:14 -0500 Subject: [PATCH 013/167] [Build/CI] Update run-amd-test.sh. Enable Docker Hub login. (#6711) --- .buildkite/run-amd-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 618d712b0279b..77e451354caf6 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -55,6 +55,7 @@ while true; do done echo "--- Pulling container" +docker login registry-1.docker.io -u alexeivivanovamd -p ${DH_TOKEN} image_name="rocmshared/vllm-ci:${BUILDKITE_COMMIT}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" docker pull ${image_name} From f4f8a9d892a357e341b90bc47a8d72ece62323d5 Mon Sep 17 00:00:00 2001 From: liuyhwangyh Date: Wed, 24 Jul 2024 20:04:46 +0800 Subject: [PATCH 014/167] [Bugfix]fix modelscope compatible issue (#6730) --- vllm/model_executor/model_loader/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index fe501b9b22da1..bbe49655020da 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -161,7 +161,7 @@ def _maybe_download_from_modelscope( cache_dir=self.load_config.download_dir, local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, revision=revision, - ignore_patterns=self.load_config.ignore_patterns, + ignore_file_pattern=self.load_config.ignore_patterns, ) else: model_path = model From 545146349c3c28282cbb60d503f06df473adc932 Mon Sep 17 00:00:00 2001 From: LF Marques Date: Wed, 24 Jul 2024 16:55:53 +0100 Subject: [PATCH 015/167] Adding f-string to validation error which is missing (#6748) --- vllm/entrypoints/chat_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 83abc40888137..dca4523d1a27d 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -107,7 +107,7 @@ def _image_token_str(model_config: ModelConfig, return tokenizer.decode(model_config.hf_config.image_token_index) if model_type == "chameleon": return "" - raise TypeError("Unknown model type: {model_type}") + raise TypeError(f"Unknown model type: {model_type}") # TODO: Let user specify how to insert image tokens into prompt From 2cf0df3381b74f93dce3215bf5a043b5c47c55f4 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 24 Jul 2024 08:58:31 -0700 Subject: [PATCH 016/167] [Bugfix] Fix speculative decode seeded test (#6743) --- tests/spec_decode/e2e/conftest.py | 3 ++- tests/spec_decode/e2e/test_seed.py | 22 +++++++++++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index bd1ea43f0b101..f9f246436c0f7 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -191,7 +191,8 @@ def generator_inner(): and llm.llm_engine.log_stats): for sate_logger in llm.llm_engine.stat_loggers.values(): sate_logger.local_interval = 0 - set_random_seed(seed) + if seed is not None: + set_random_seed(seed) yield llm del llm diff --git a/tests/spec_decode/e2e/test_seed.py b/tests/spec_decode/e2e/test_seed.py index 
792d7cba0f270..394a53f03ed46 100644 --- a/tests/spec_decode/e2e/test_seed.py +++ b/tests/spec_decode/e2e/test_seed.py @@ -21,7 +21,8 @@ "num_speculative_tokens": 3, }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}]) +@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}]) @pytest.mark.parametrize("batch_size", [1, 8, 32]) @pytest.mark.parametrize("temperature", [0.1, 1.0]) @pytest.mark.parametrize( @@ -30,15 +31,26 @@ # Use smaller output len for fast test. 10, ]) -@pytest.mark.parametrize("seed", [1]) -def test_seeded_consistency(baseline_llm_generator, batch_size: int, - temperature: float, output_len: int): +@pytest.mark.parametrize("seed", [None]) +def test_seeded_consistency(baseline_llm_generator, test_llm_generator, + batch_size: int, temperature: float, + output_len: int): """Verify outputs are consistent across multiple runs with same seed """ run_equality_correctness_test(baseline_llm_generator, - baseline_llm_generator, + test_llm_generator, batch_size, max_output_len=output_len, temperature=temperature, seeded=True, force_output_len=True) + + # Ensure this same test does fail if we _don't_ include per-request seeds + with pytest.raises(AssertionError): + run_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + temperature=temperature, + seeded=False, + force_output_len=True) From 40468b13faa1ebde366e7002c5752b59e1368d10 Mon Sep 17 00:00:00 2001 From: "Allen.Dou" Date: Wed, 24 Jul 2024 23:58:42 +0800 Subject: [PATCH 017/167] [Bugfix] Miscalculated latency lead to time_to_first_token_seconds inaccurate. (#6686) --- vllm/engine/llm_engine.py | 3 ++- vllm/spec_decode/spec_decode_worker.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index eabe3b23a9d58..48d5305892219 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -949,8 +949,9 @@ def do_log_stats( model_output: Optional[List[SamplerOutput]] = None) -> None: """Forced log when no requests active.""" if self.log_stats: + stats = self._get_stats(scheduler_outputs, model_output) for logger in self.stat_loggers.values(): - logger.log(self._get_stats(scheduler_outputs, model_output)) + logger.log(stats) def _get_stats( self, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8cf0aa5b8981a..98960b88f719f 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -484,7 +484,7 @@ def _run_non_driver_rank(self) -> bool: for both speculation cases (num_lookahead_slots>0) and non-speculation cases (e.g. prefill). - Returns True iff there are remaining sequences to process. + Returns True if there are remaining sequences to process. 
""" assert self.rank != self._driver_rank From ee812580f7ae3969cf2550cc4dab31490bfea950 Mon Sep 17 00:00:00 2001 From: Daniele <36171005+dtrifiro@users.noreply.github.com> Date: Wed, 24 Jul 2024 19:36:04 +0200 Subject: [PATCH 018/167] [Frontend] split run_server into build_server and run_server (#6740) --- vllm/entrypoints/openai/api_server.py | 77 +++++++++++++++++---------- vllm/scripts.py | 3 +- 2 files changed, 52 insertions(+), 28 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 931063d90566c..add5c91900b23 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -2,6 +2,7 @@ import importlib import inspect import re +import signal from contextlib import asynccontextmanager from http import HTTPStatus from typing import Optional, Set @@ -213,12 +214,13 @@ async def authentication(request: Request, call_next): return app -def run_server(args, llm_engine=None): +async def build_server( + args, + llm_engine: Optional[AsyncLLMEngine] = None, + **uvicorn_kwargs, +) -> uvicorn.Server: app = build_app(args) - logger.info("vLLM API server version %s", VLLM_VERSION) - logger.info("args: %s", args) - if args.served_model_name is not None: served_model_names = args.served_model_name else: @@ -231,19 +233,7 @@ def run_server(args, llm_engine=None): if llm_engine is not None else AsyncLLMEngine.from_engine_args( engine_args, usage_context=UsageContext.OPENAI_API_SERVER)) - event_loop: Optional[asyncio.AbstractEventLoop] - try: - event_loop = asyncio.get_running_loop() - except RuntimeError: - event_loop = None - - if event_loop is not None and event_loop.is_running(): - # If the current is instanced by Ray Serve, - # there is already a running event loop - model_config = event_loop.run_until_complete(engine.get_model_config()) - else: - # When using single vLLM without engine_use_ray - model_config = asyncio.run(engine.get_model_config()) + model_config = await engine.get_model_config() if args.disable_log_requests: request_logger = None @@ -296,15 +286,48 @@ def run_server(args, llm_engine=None): methods = ', '.join(route.methods) logger.info("Route: %s, Methods: %s", route.path, methods) - uvicorn.run(app, - host=args.host, - port=args.port, - log_level=args.uvicorn_log_level, - timeout_keep_alive=TIMEOUT_KEEP_ALIVE, - ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile, - ssl_ca_certs=args.ssl_ca_certs, - ssl_cert_reqs=args.ssl_cert_reqs) + config = uvicorn.Config( + app, + host=args.host, + port=args.port, + log_level=args.uvicorn_log_level, + timeout_keep_alive=TIMEOUT_KEEP_ALIVE, + ssl_keyfile=args.ssl_keyfile, + ssl_certfile=args.ssl_certfile, + ssl_ca_certs=args.ssl_ca_certs, + ssl_cert_reqs=args.ssl_cert_reqs, + **uvicorn_kwargs, + ) + + return uvicorn.Server(config) + + +async def run_server(args, llm_engine=None, **uvicorn_kwargs) -> None: + logger.info("vLLM API server version %s", VLLM_VERSION) + logger.info("args: %s", args) + + server = await build_server( + args, + llm_engine, + **uvicorn_kwargs, + ) + + loop = asyncio.get_running_loop() + + server_task = loop.create_task(server.serve()) + + def signal_handler() -> None: + # prevents the uvicorn signal handler to exit early + server_task.cancel() + + loop.add_signal_handler(signal.SIGINT, signal_handler) + loop.add_signal_handler(signal.SIGTERM, signal_handler) + + try: + await server_task + except asyncio.CancelledError: + print("Gracefully stopping http server") + await server.shutdown() if __name__ == "__main__": @@ 
-314,4 +337,4 @@ def run_server(args, llm_engine=None): description="vLLM OpenAI-Compatible RESTful API server.") parser = make_arg_parser(parser) args = parser.parse_args() - run_server(args) + asyncio.run(run_server(args)) diff --git a/vllm/scripts.py b/vllm/scripts.py index 3f334be925ee8..aefa5cec93a57 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -1,5 +1,6 @@ # The CLI entrypoint to vLLM. import argparse +import asyncio import os import signal import sys @@ -25,7 +26,7 @@ def serve(args: argparse.Namespace) -> None: # EngineArgs expects the model name to be passed as --model. args.model = args.model_tag - run_server(args) + asyncio.run(run_server(args)) def interactive_cli(args: argparse.Namespace) -> None: From 0e63494cf334497148ee4203b59e34b2dd53e50e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 24 Jul 2024 11:36:52 -0700 Subject: [PATCH 019/167] Add fp8 support to `reshape_and_cache_flash` (#6667) --- csrc/cache.h | 3 +- csrc/cache_kernels.cu | 75 ++++++++++++++++----------- csrc/torch_bindings.cpp | 3 +- tests/kernels/test_cache.py | 42 ++++++++++++--- vllm/_custom_ops.py | 5 +- vllm/attention/backends/flash_attn.py | 2 + vllm/attention/backends/flashinfer.py | 2 + vllm/utils.py | 9 +++- 8 files changed, 98 insertions(+), 43 deletions(-) diff --git a/csrc/cache.h b/csrc/cache.h index 52177e8901a89..11c4c5001daaa 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -25,7 +25,8 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype); + const std::string& kv_cache_dtype, + const double k_scale, const double v_scale); // Just for unittest void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index caef7f5e18630..1be806bbfa43c 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -203,17 +203,18 @@ __global__ void reshape_and_cache_kernel( } } -template +template __global__ void reshape_and_cache_flash_kernel( const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] - scalar_t* __restrict__ k_cache, // [num_blocks, block_size, num_heads, + cache_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads, // head_size] - scalar_t* __restrict__ v_cache, // [num_blocks, block_size, num_heads, + cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads, // head_size] const int64_t* __restrict__ slot_mapping, // [num_tokens] const int block_stride, const int key_stride, const int value_stride, - const int num_heads, const int head_size, const int block_size) { + const int num_heads, const int head_size, const int block_size, + const float k_scale, const float v_scale) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; // NOTE: slot_idx can be -1 if the token is padded @@ -228,11 +229,20 @@ __global__ void reshape_and_cache_flash_kernel( const int64_t src_value_idx = token_idx * value_stride + i; const int head_idx = i / head_size; const int head_offset = i % head_size; - const int64_t tgt_value_idx = block_idx * block_stride + - block_offset * num_heads * head_size + - head_idx * head_size + head_offset; - k_cache[tgt_value_idx] = key[src_key_idx]; - v_cache[tgt_value_idx] = value[src_value_idx]; + const int64_t tgt_key_value_idx = block_idx * block_stride + + block_offset * num_heads * head_size + + head_idx * 
head_size + head_offset; + scalar_t tgt_key = key[src_key_idx]; + scalar_t tgt_value = value[src_value_idx]; + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + key_cache[tgt_key_value_idx] = tgt_key; + value_cache[tgt_key_value_idx] = tgt_value; + } else { + key_cache[tgt_key_value_idx] = + fp8::scaled_convert(tgt_key, k_scale); + value_cache[tgt_key_value_idx] = + fp8::scaled_convert(tgt_value, v_scale); + } } } } // namespace vllm @@ -278,40 +288,45 @@ void reshape_and_cache( CALL_RESHAPE_AND_CACHE) } +// KV_T is the stored data type of kv-cache. +// CACHE_T is the data type of key and value tensors. +// KV_DTYPE is the real data type of kv-cache. +#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \ + vllm::reshape_and_cache_flash_kernel \ + <<>>( \ + reinterpret_cast(key.data_ptr()), \ + reinterpret_cast(value.data_ptr()), \ + reinterpret_cast(key_cache.data_ptr()), \ + reinterpret_cast(value_cache.data_ptr()), \ + slot_mapping.data_ptr(), block_stride, key_stride, \ + value_stride, num_heads, head_size, block_size, k_scale, v_scale); + void reshape_and_cache_flash( - torch::Tensor& key, // [num_tokens, num_heads, head_size] - torch::Tensor& value, // [num_tokens, num_heads, head_size] - torch::Tensor& k_cache, // [num_blocks, block_size, num_heads, head_size] - torch::Tensor& v_cache, // [num_blocks, block_size, num_heads, head_size] + torch::Tensor& key, // [num_tokens, num_heads, head_size] + torch::Tensor& value, // [num_tokens, num_heads, head_size] + torch::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size] + torch::Tensor& + value_cache, // [num_blocks, block_size, num_heads, head_size] torch::Tensor& slot_mapping, // [num_tokens] - const std::string& kv_cache_dtype) { - // FIXME: only support auto datatype, does not support fp8 - if (kv_cache_dtype != "auto") { - TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype); - } + const std::string& kv_cache_dtype, const double k_scale, + const double v_scale) { int num_tokens = key.size(0); int num_heads = key.size(1); int head_size = key.size(2); - int block_size = k_cache.size(1); + int block_size = key_cache.size(1); int key_stride = key.stride(0); int value_stride = value.stride(0); - int block_stride = k_cache.stride(0); - TORCH_CHECK(k_cache.stride(0) == v_cache.stride(0)); + int block_stride = key_cache.stride(0); + TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0)); dim3 grid(num_tokens); dim3 block(std::min(num_heads * head_size, 512)); const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - key.scalar_type(), "reshape_and_cache_flash", [&] { - vllm::reshape_and_cache_flash_kernel - <<>>( - key.data_ptr(), value.data_ptr(), - k_cache.data_ptr(), v_cache.data_ptr(), - slot_mapping.data_ptr(), block_stride, key_stride, - value_stride, num_heads, head_size, block_size); - }); + + DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype, + CALL_RESHAPE_AND_CACHE_FLASH); } namespace vllm { diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 0df9bdb75018f..3027b63ba2b33 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -248,7 +248,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor! key_cache," " Tensor! 
value_cache," " Tensor slot_mapping," - " str kv_cache_dtype) -> ()"); + " str kv_cache_dtype," + " float k_scale, float v_scale) -> ()"); cache_ops.impl("reshape_and_cache_flash", torch::kCUDA, &reshape_and_cache_flash); diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 70ae3d0c6e0c3..f9a609464abfc 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -215,8 +215,6 @@ def test_reshape_and_cache_flash( device: str, kv_cache_dtype: str, ) -> None: - if kv_cache_dtype == "fp8": - pytest.skip() random.seed(seed) torch.random.manual_seed(seed) torch.cuda.manual_seed(seed) @@ -248,15 +246,33 @@ def test_reshape_and_cache_flash( dtype, device=device, ) - key_cache, value_cache = key_caches[0], value_caches[0] + key_cache, value_cache = key_caches[0].contiguous( + ), value_caches[0].contiguous() + del key_caches + del value_caches # Clone the KV caches. - cloned_key_cache = key_cache.clone() - cloned_value_cache = value_cache.clone() + if kv_cache_dtype == "fp8": + cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16) + ops.convert_fp8(cloned_key_cache, key_cache) + cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16) + ops.convert_fp8(cloned_value_cache, value_cache) + else: + cloned_key_cache = key_cache.clone() + cloned_value_cache = value_cache.clone() + + # Using default kv_scale + k_scale = v_scale = 1.0 # Call the reshape_and_cache kernel. ops.reshape_and_cache_flash(key, value, key_cache, value_cache, - slot_mapping, kv_cache_dtype) + slot_mapping, kv_cache_dtype, k_scale, v_scale) + + if kv_cache_dtype == "fp8": + result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) + ops.convert_fp8(result_key_cache, key_cache) + result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) + ops.convert_fp8(result_value_cache, value_cache) # Run the reference implementation. 
block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") @@ -269,8 +285,18 @@ def test_reshape_and_cache_flash( cloned_key_cache[block_idx, block_offset, :, :] = key[i] cloned_value_cache[block_idx, block_offset, :, :] = value[i] - assert torch.allclose(key_cache, cloned_key_cache) - assert torch.allclose(value_cache, cloned_value_cache) + if kv_cache_dtype == "fp8": + assert torch.allclose(result_key_cache, + cloned_key_cache, + atol=0.001, + rtol=0.1) + assert torch.allclose(result_value_cache, + cloned_value_cache, + atol=0.001, + rtol=0.1) + else: + assert torch.allclose(key_cache, cloned_key_cache) + assert torch.allclose(value_cache, cloned_value_cache) @pytest.mark.parametrize("direction", COPYING_DIRECTION) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index e5151c070f2f7..0186594656cc1 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -426,10 +426,13 @@ def reshape_and_cache_flash( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, + k_scale: float, + v_scale: float, ) -> None: torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache, slot_mapping, - kv_cache_dtype) + kv_cache_dtype, k_scale, + v_scale) def copy_blocks(key_caches: List[torch.Tensor], diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index b16a204c8f44e..949bd973cf3c4 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -478,6 +478,8 @@ def forward( value_cache, attn_metadata.slot_mapping.flatten(), self.kv_cache_dtype, + k_scale, + v_scale, ) num_prefill_tokens = attn_metadata.num_prefill_tokens diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 9dac12d3b906d..2a4900489df35 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -489,6 +489,8 @@ def forward( kv_cache[:, 1], attn_metadata.slot_mapping.flatten(), self.kv_cache_dtype, + k_scale, + v_scale, ) query = query.contiguous( diff --git a/vllm/utils.py b/vllm/utils.py index 83605631b5bd6..876c3bf90b02c 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -491,7 +491,6 @@ def create_kv_caches_with_random_flash( seed: int = 0, device: Optional[str] = "cuda", ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: - assert cache_dtype != "fp8" torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) @@ -507,7 +506,13 @@ def create_kv_caches_with_random_flash( key_value_cache = torch.empty(size=key_value_cache_shape, dtype=torch_dtype, device=device) - key_value_cache.uniform_(-scale, scale) + if cache_dtype in ["auto", "half", "bfloat16", "float"]: + key_value_cache.uniform_(-scale, scale) + elif cache_dtype == 'fp8': + _generate_random_fp8(key_value_cache, -scale, scale) + else: + raise ValueError( + f"Does not support key cache of type {cache_dtype}") key_caches.append(key_value_cache[:, 0]) value_caches.append(key_value_cache[:, 1]) return key_caches, value_caches From 5448f67635570cee6fc23c7cd166e9d8f7595009 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 24 Jul 2024 12:17:12 -0700 Subject: [PATCH 020/167] [Core] Tweaks to model runner/input builder developer APIs (#6712) --- vllm/attention/backends/flashinfer.py | 35 ++++--- vllm/worker/embedding_model_runner.py | 4 +- vllm/worker/model_runner.py | 134 +++++++++++++++++--------- 3 files changed, 109 insertions(+), 64 deletions(-) diff --git a/vllm/attention/backends/flashinfer.py 
b/vllm/attention/backends/flashinfer.py index 2a4900489df35..9746304347d6e 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -297,23 +297,26 @@ def _add_seq_group( if is_profile_run: return - # Get the number of valid blocks based on sequence length. - # If seq_len = 16, block_size = 16, - # block_table_bound is 1 with 1 valid block. - # If seq_len = 15, block_size = 16, - # block_table_bound is 0 + 1 with 1 valid block. - block_table_bound = seq_len // self.block_size + 1 \ - if seq_len % self.block_size != 0 \ - else seq_len // self.block_size block_table = block_tables[seq_id] - self.paged_kv_indices.extend(block_table[:block_table_bound]) - self.paged_kv_indptr.append(self.paged_kv_indptr[-1] + - block_table_bound) - - last_page_len = seq_len % self.block_size - if last_page_len == 0: - last_page_len = self.block_size - self.paged_kv_last_page_len.append(last_page_len) + self._update_paged_kv_tensors(block_table, seq_len) + + def _update_paged_kv_tensors(self, block_table: List[int], seq_len: int): + # Get the number of valid blocks based on sequence length. + # If seq_len = 16, block_size = 16, + # block_table_bound is 1 with 1 valid block. + # If seq_len = 15, block_size = 16, + # block_table_bound is 0 + 1 with 1 valid block. + block_table_bound = seq_len // self.block_size + 1 \ + if seq_len % self.block_size != 0 \ + else seq_len // self.block_size + self.paged_kv_indices.extend(block_table[:block_table_bound]) + self.paged_kv_indptr.append(self.paged_kv_indptr[-1] + + block_table_bound) + + last_page_len = seq_len % self.block_size + if last_page_len == 0: + last_page_len = self.block_size + self.paged_kv_last_page_len.append(last_page_len) def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index a333e6634a41f..e919dbd18d9df 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -11,7 +11,8 @@ from vllm.pooling_params import PoolingParams from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData, SequenceGroupMetadata) -from vllm.worker.model_runner import GPUModelRunnerBase, ModelInputForGPU +from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPU, + ModelInputForGPUBuilder) logger = init_logger(__name__) @@ -28,6 +29,7 @@ class EmbeddingModelRunner( GPUModelRunnerBase[ModelInputForGPUWithPoolingMetadata]): _model_input_cls: Type[ModelInputForGPUWithPoolingMetadata] = ( ModelInputForGPUWithPoolingMetadata) + _builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder def __init__( self, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 073c5a73f739b..86d26b4a84c36 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -3,7 +3,7 @@ import time import warnings import weakref -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Set, Tuple, Type, TypeVar, Union) @@ -171,48 +171,83 @@ def from_broadcasted_tensor_dict( class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): """Build ModelInputForGPU from SequenceGroupMetadata.""" - @dataclass + # Note: ideally we would be using a dataclass(kw_only=True) + # here, so that this can be subclassed easily, + # but kw_only is not supported in python<3.10. 
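# --- Editor's note: illustrative sketch, not part of this patch -----------
# The note above refers to dataclasses.dataclass(kw_only=True), which only
# exists on Python >= 3.10; the explicit keyword-only __init__ introduced by
# this patch emulates the same behaviour on older interpreters. A minimal,
# self-contained comparison (class and field names here are hypothetical):
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass(kw_only=True)  # requires Python >= 3.10
class ExampleInterData:
    request_id: str
    seq_ids: List[int]
    input_tokens: List[List[int]] = field(default_factory=list)


class ExampleInterDataCompat:
    """Same keyword-only construction, but works on Python < 3.10."""

    def __init__(self, *, request_id: str, seq_ids: List[int],
                 input_tokens: Optional[List[List[int]]] = None) -> None:
        self.request_id = request_id
        self.seq_ids = seq_ids
        # A fresh list per instance, mirroring field(default_factory=list).
        self.input_tokens = input_tokens or []
# ---------------------------------------------------------------------------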
class InterDataForSeqGroup: """Intermediate data for the current sequence group.""" - # From sequence group metadata. - request_id: str - seq_ids: List[int] - is_prompt: bool - block_tables: Optional[Dict[int, List[int]]] - computed_block_nums: List[int] - n_seqs: int = 0 - - # Input tokens and positions. - input_tokens: List[List[int]] = field(default_factory=list) - input_positions: List[List[int]] = field(default_factory=list) - - # The sequence length (may be capped to the sliding window). - seq_lens: List[int] = field(default_factory=list) - # The original sequence length (before applying sliding window). - # This is used to compute slot mapping. - orig_seq_lens: List[int] = field(default_factory=list) - # The query length. - query_lens: List[int] = field(default_factory=list) - # The number of tokens that are already computed. - context_lens: List[int] = field(default_factory=list) - # The current sliding window block. - curr_sliding_window_blocks: List[int] = field(default_factory=list) - - # LoRA inputs. - lora_index_mapping: List[List[int]] = field(default_factory=list) - lora_prompt_mapping: List[List[int]] = field(default_factory=list) - lora_requests: Set[LoRARequest] = field(default_factory=set) - - # Prompt adapter inputs. - prompt_adapter_index_mapping: List[int] = field(default_factory=list) - prompt_adapter_prompt_mapping: List[int] = field(default_factory=list) - prompt_adapter_request: Optional[PromptAdapterRequest] = None - - # Multi-modal inputs. - multi_modal_inputs: Optional[MultiModalInputs] = None - - # Whether the prefix cache is hit (prefill only). - prefix_cache_hit: bool = False + + def __init__( + self, + *, + # From sequence group metadata. + request_id: str, + seq_ids: List[int], + is_prompt: bool, + block_tables: Optional[Dict[int, List[int]]], + computed_block_nums: List[int], + n_seqs: int = 0, + + # Input tokens and positions. + input_tokens: Optional[List[List[int]]] = None, + input_positions: Optional[List[List[int]]] = None, + + # The sequence length (may be capped to the sliding window). + seq_lens: Optional[List[int]] = None, + # The original sequence length (before applying sliding window). + # This is used to compute slot mapping. + orig_seq_lens: Optional[List[int]] = None, + # The query length. + query_lens: Optional[List[int]] = None, + # The number of tokens that are already computed. + context_lens: Optional[List[int]] = None, + # The current sliding window block. + curr_sliding_window_blocks: Optional[List[int]] = None, + + # LoRA inputs. + lora_index_mapping: Optional[List[List[int]]] = None, + lora_prompt_mapping: Optional[List[List[int]]] = None, + lora_requests: Optional[Set[LoRARequest]] = None, + + # Prompt adapter inputs. + prompt_adapter_index_mapping: Optional[List[int]] = None, + prompt_adapter_prompt_mapping: Optional[List[int]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + + # Multi-modal inputs. + multi_modal_inputs: Optional[MultiModalInputs] = None, + + # Whether the prefix cache is hit (prefill only). 
+ prefix_cache_hit: bool = False, + ): + self.request_id = request_id + self.seq_ids = seq_ids + self.is_prompt = is_prompt + self.block_tables = block_tables + self.computed_block_nums = computed_block_nums + self.n_seqs = n_seqs + self.input_tokens = input_tokens or [] + self.input_positions = input_positions or [] + self.seq_lens = seq_lens or [] + self.orig_seq_lens = orig_seq_lens or [] + self.query_lens = query_lens or [] + self.context_lens = context_lens or [] + self.curr_sliding_window_blocks = curr_sliding_window_blocks or [] + + self.lora_index_mapping = lora_index_mapping or [] + self.lora_prompt_mapping = lora_prompt_mapping or [] + self.lora_requests = lora_requests or set() + + self.prompt_adapter_index_mapping = (prompt_adapter_index_mapping + or []) + self.prompt_adapter_prompt_mapping = (prompt_adapter_prompt_mapping + or []) + self.prompt_adapter_request = prompt_adapter_request + + self.multi_modal_inputs = multi_modal_inputs + self.prefix_cache_hit = prefix_cache_hit + + self.__post_init__() def __post_init__(self): self.n_seqs = len(self.seq_ids) @@ -457,6 +492,12 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): for per_seq_group_fn in self.per_seq_group_compute_fns: per_seq_group_fn(inter_data, seq_group_metadata) + def _use_captured_graph(self, batch_size: int, + max_decode_seq_len: int) -> bool: + return (self.decode_only and not self.runner.model_config.enforce_eager + and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] + and max_decode_seq_len <= self.runner.max_seq_len_to_capture) + def build(self) -> ModelInputForGPU: """Finalize the builder intermediate data and create on-device tensors. @@ -491,10 +532,8 @@ def build(self) -> ModelInputForGPU: } batch_size = len(input_tokens) - use_captured_graph = ( - self.decode_only and not self.runner.model_config.enforce_eager - and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] - and max_decode_seq_len <= self.runner.max_seq_len_to_capture) + use_captured_graph = self._use_captured_graph(batch_size, + max_decode_seq_len) # If cuda graph can be used, pad tensors accordingly. # See `capture_model` API for more details. @@ -592,6 +631,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): Helper class for shared methods between GPU model runners. """ _model_input_cls: Type[TModelInputForGPU] + _builder_cls: Type[ModelInputForGPUBuilder] def __init__( self, @@ -794,8 +834,7 @@ def _prepare_model_input_tensors( If cuda graph is required, this API automatically pads inputs. 
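# --- Editor's note: illustrative sketch, not part of this patch -----------
# The padding referred to above makes a decode batch fit one of the batch
# sizes for which a CUDA graph was captured, so the graph's fixed shapes can
# be reused. Conceptually it is a round-up to the nearest captured size; the
# helper and capture list below are placeholders, the real values live in
# vllm/worker/model_runner.py (_BATCH_SIZES_TO_CAPTURE):
from typing import List


def pad_to_captured_batch_size(batch_size: int,
                               capture_sizes: List[int]) -> int:
    """Return the smallest captured batch size that can hold `batch_size`."""
    for captured in sorted(capture_sizes):
        if captured >= batch_size:
            return captured
    # Larger than anything captured: the caller falls back to eager mode.
    return batch_size


assert pad_to_captured_batch_size(3, [1, 2, 4, 8, 16]) == 4
assert pad_to_captured_batch_size(12, [1, 2, 4, 8, 16]) == 16
# ---------------------------------------------------------------------------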
""" - builder = ModelInputForGPUBuilder(weakref.proxy(self), - finished_requests_ids) + builder = self._builder_cls(weakref.proxy(self), finished_requests_ids) for seq_group_metadata in seq_group_metadata_list: builder.add_seq_group(seq_group_metadata) return builder.build() # type: ignore @@ -1191,6 +1230,7 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): """ _model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = ( ModelInputForGPUWithSamplingMetadata) + _builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder def make_model_input_from_broadcasted_tensor_dict( self, From 421e218b37bd98b52bb3737c5aacc5a60fd460c0 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 24 Jul 2024 16:22:16 -0400 Subject: [PATCH 021/167] [Bugfix] Bump transformers to 4.43.2 (#6752) --- requirements-common.txt | 2 +- tests/test_config.py | 55 +++++++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 940740722c7e6..3b8d473c1fe7a 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -6,7 +6,7 @@ numpy < 2.0.0 requests tqdm py-cpuinfo -transformers >= 4.43.1 # Required for Chameleon and Llama 3.1 hotfox. +transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfox. tokenizers >= 0.19.1 # Required for Llama 3. fastapi aiohttp diff --git a/tests/test_config.py b/tests/test_config.py index 9f7d85e39ad67..225d71c0bc0ea 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -64,8 +64,9 @@ def test_get_sliding_window(): def test_rope_customization(): - TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0} + TEST_ROPE_SCALING = {"type": "dynamic", "factor": 2.0} TEST_ROPE_THETA = 16_000_000.0 + LONGCHAT_ROPE_SCALING = {"type": "linear", "factor": 8.0} llama_model_config = ModelConfig( "meta-llama/Meta-Llama-3-8B-Instruct", @@ -95,29 +96,29 @@ def test_rope_customization(): None) == TEST_ROPE_THETA assert llama_model_config.max_model_len == 16384 - # TODO: add these back when the rope configs are fixed - # LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0} - # longchat_model_config = ModelConfig( - # "lmsys/longchat-13b-16k", - # "lmsys/longchat-13b-16k", - # tokenizer_mode="auto", - # trust_remote_code=False, - # dtype="float16", - # seed=0, - # ) - # assert getattr(longchat_model_config.hf_config, "rope_scaling", - # None) == LONGCHAT_ROPE_SCALING - # assert longchat_model_config.max_model_len == 16384 - - # longchat_model_config = ModelConfig( - # "lmsys/longchat-13b-16k", - # "lmsys/longchat-13b-16k", - # tokenizer_mode="auto", - # trust_remote_code=False, - # dtype="float16", - # seed=0, - # rope_scaling=TEST_ROPE_SCALING, - # ) - # assert getattr(longchat_model_config.hf_config, "rope_scaling", - # None) == TEST_ROPE_SCALING - # assert longchat_model_config.max_model_len == 4096 + longchat_model_config = ModelConfig( + "lmsys/longchat-13b-16k", + "lmsys/longchat-13b-16k", + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + ) + # Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config + assert all( + longchat_model_config.hf_config.rope_scaling.get(key) == value + for key, value in LONGCHAT_ROPE_SCALING.items()) + assert longchat_model_config.max_model_len == 16384 + + longchat_model_config = ModelConfig( + "lmsys/longchat-13b-16k", + "lmsys/longchat-13b-16k", + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + 
rope_scaling=TEST_ROPE_SCALING, + ) + assert getattr(longchat_model_config.hf_config, "rope_scaling", + None) == TEST_ROPE_SCALING + assert longchat_model_config.max_model_len == 4096 From d88c458f44f5bc0d01215310f8abb5d63fa106d4 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Wed, 24 Jul 2024 17:32:57 -0400 Subject: [PATCH 022/167] [Doc][AMD][ROCm]Added tips to refer to mi300x tuning guide for mi300x users (#6754) --- docs/source/getting_started/amd-installation.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 71d7527a3e706..1c7d274b7c47e 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -142,3 +142,10 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases. - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. - To use CK flash-attention or PyTorch naive attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. - The ROCm version of PyTorch, ideally, should match the ROCm driver version. + + +.. tip:: + - For MI300x (gfx942) users, to achieve optimal performance, please refer to `MI300x tuning guide `_ for performance optimization and tuning tips on system and workflow level. + For vLLM, please refer to `vLLM performance optimization `_. + + From 740374d456a638df98ffbc7d9dab328752330e62 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 24 Jul 2024 17:37:12 -0700 Subject: [PATCH 023/167] [core][distributed] fix zmq hang (#6759) --- vllm/connections.py | 4 +- .../device_communicators/shm_broadcast.py | 60 +++++++------------ 2 files changed, 23 insertions(+), 41 deletions(-) diff --git a/vllm/connections.py b/vllm/connections.py index 65d44176e2464..e785a0b3ebd74 100644 --- a/vllm/connections.py +++ b/vllm/connections.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Mapping, Optional +from typing import Mapping, MutableMapping, Optional from urllib.parse import urlparse import aiohttp @@ -40,7 +40,7 @@ def _validate_http_url(self, url: str): raise ValueError("Invalid HTTP URL: A valid HTTP URL " "must have scheme 'http' or 'https'.") - def _headers(self, **extras: str) -> Mapping[str, str]: + def _headers(self, **extras: str) -> MutableMapping[str, str]: return {"User-Agent": f"vLLM/{VLLM_VERSION}", **extras} def get_response( diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 75d84c7a71bc3..d4847542688c0 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -9,7 +9,7 @@ import torch import torch.distributed as dist from torch.distributed import ProcessGroup -from zmq import PUB, REP, REQ, SUB, SUBSCRIBE, Context # type: ignore +from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context # type: ignore import vllm.envs as envs from vllm.logger import init_logger @@ -153,9 +153,7 @@ class Handle: buffer: Optional[ShmRingBuffer] = None local_subscribe_port: Optional[int] = None - local_sync_port: Optional[int] = None remote_subscribe_port: Optional[int] = None - remote_sync_port: Optional[int] = None class MessageQueue: @@ -189,38 +187,36 @@ def __init__( self.buffer = ShmRingBuffer(n_local_reader, 
max_chunk_bytes, max_chunks) - self.local_socket = context.socket(PUB) + # XPUB is very similar to PUB, + # except that it can receive subscription messages + # to confirm the number of subscribers + self.local_socket = context.socket(XPUB) + # set the verbose option so that we can receive every subscription + # message. otherwise, we will only receive the first subscription + # see http://api.zeromq.org/3-3:zmq-setsockopt for more details + self.local_socket.setsockopt(XPUB_VERBOSE, True) local_subscribe_port = get_open_port() self.local_socket.bind(f"tcp://*:{local_subscribe_port}") - self.local_sync_socket = context.socket(REP) - local_sync_port = get_open_port() - self.local_sync_socket.bind(f"tcp://*:{local_sync_port}") self.current_idx = 0 else: self.buffer = None # type: ignore local_subscribe_port = None - local_sync_port = None self.local_socket = None - self.local_sync_socket = None self.current_idx = -1 if n_remote_reader > 0: # for remote readers, we will: # create a publish-subscribe socket to communicate large data - self.remote_socket = context.socket(PUB) + self.remote_socket = context.socket(XPUB) + self.remote_socket.setsockopt(XPUB_VERBOSE, True) remote_subscribe_port = get_open_port() self.remote_socket.bind(f"tcp://*:{remote_subscribe_port}") - self.remote_sync_socket = context.socket(REP) - remote_sync_port = get_open_port() - self.remote_sync_socket.bind(f"tcp://*:{remote_sync_port}") else: remote_subscribe_port = None - remote_sync_port = None self.remote_socket = None - self.remote_sync_socket = None self._is_writer = True self._is_local_reader = False @@ -233,9 +229,7 @@ def __init__( local_reader_ranks=local_reader_ranks, buffer=self.buffer, local_subscribe_port=local_subscribe_port, - local_sync_port=local_sync_port, remote_subscribe_port=remote_subscribe_port, - remote_sync_port=remote_sync_port, ) logger.info("vLLM message queue communication handle: %s", self.handle) @@ -264,12 +258,7 @@ def create_from_handle(handle: Handle, rank) -> "MessageQueue": self.local_socket.connect( f"tcp://{handle.connect_ip}:{handle.local_subscribe_port}") - self.local_sync_socket = context.socket(REQ) - self.local_sync_socket.connect( - f"tcp://{handle.connect_ip}:{handle.local_sync_port}") - self.remote_socket = None - self.remote_sync_socket = None else: self.buffer = None # type: ignore self.current_idx = -1 @@ -278,17 +267,12 @@ def create_from_handle(handle: Handle, rank) -> "MessageQueue": self._is_remote_reader = True self.local_socket = None - self.local_sync_socket = None self.remote_socket = context.socket(SUB) self.remote_socket.setsockopt_string(SUBSCRIBE, "") self.remote_socket.connect( f"tcp://{handle.connect_ip}:{handle.remote_subscribe_port}") - self.remote_sync_socket = context.socket(REQ) - self.remote_sync_socket.connect( - f"tcp://{handle.connect_ip}:{handle.remote_sync_port}") - return self def wait_until_ready(self): @@ -300,29 +284,27 @@ def wait_until_ready(self): # local readers for i in range(self.n_local_reader): - recv = self.local_sync_socket.recv() - assert recv == b"READY" - self.local_sync_socket.send(b"READY") + # wait for subscription messages from all local readers + self.local_socket.recv() if self.n_local_reader > 0: + # send a message to all local readers + # to make sure the publish channel is working self.local_socket.send(b"READY") # remote readers for i in range(self.n_remote_reader): - recv = self.remote_sync_socket.recv() - assert recv == b"READY" - self.remote_sync_socket.send(b"READY") + # wait for subscription messages from all 
remote readers + self.remote_socket.recv() if self.n_remote_reader > 0: + # send a message to all remote readers + # to make sure the publish channel is working self.remote_socket.send(b"READY") elif self._is_local_reader: - self.local_sync_socket.send(b"READY") - recv = self.local_sync_socket.recv() - assert recv == b"READY" + # wait for the writer to send a message recv = self.local_socket.recv() assert recv == b"READY" elif self._is_remote_reader: - self.remote_sync_socket.send(b"READY") - recv = self.remote_sync_socket.recv() - assert recv == b"READY" + # wait for the writer to send a message recv = self.remote_socket.recv() assert recv == b"READY" From 5689e256baf0c45148a01ad147abf11ad82c9690 Mon Sep 17 00:00:00 2001 From: "Evan Z. Liu" Date: Wed, 24 Jul 2024 18:51:00 -0700 Subject: [PATCH 024/167] [Frontend] Represent tokens with identifiable strings (#6626) --- tests/entrypoints/openai/test_completion.py | 10 ++- .../openai/test_return_tokens_as_ids.py | 83 +++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 2 + vllm/entrypoints/openai/cli_args.py | 6 ++ vllm/entrypoints/openai/serving_chat.py | 23 +++-- vllm/entrypoints/openai/serving_completion.py | 19 ++++- vllm/entrypoints/openai/serving_engine.py | 14 ++-- 7 files changed, 138 insertions(+), 19 deletions(-) create mode 100644 tests/entrypoints/openai/test_return_tokens_as_ids.py diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 0896e337b5d24..fe00640c0021e 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -55,8 +55,9 @@ def zephyr_pa_files(): @pytest.fixture(scope="module") -def server(zephyr_lora_files, zephyr_lora_added_tokens_files, zephyr_pa_files): - args = [ +def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files, + zephyr_pa_files): + return [ # use half precision for speed and memory savings in CI environment "--dtype", "bfloat16", @@ -85,7 +86,10 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files, zephyr_pa_files): "128", ] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + +@pytest.fixture(scope="module") +def server(default_server_args): + with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py new file mode 100644 index 0000000000000..abe413978e0e5 --- /dev/null +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -0,0 +1,83 @@ +# Separate these tests out from test_completion and test_chat, because they +# require launching a second server with a different flag. Running both servers +# at the same time on a single node will OOM. 
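# --- Editor's note: illustrative sketch, not part of this patch -----------
# With --return-tokens-as-token-ids enabled, the server reports every token
# as the string "token_id:<int>" rather than its decoded text, so tokens that
# are not cleanly JSON-encodable (e.g. a single character split across
# several tokens) stay identifiable. A client recovers the ids by stripping
# the prefix, as the tests below do; the ids used here are the two Zephyr
# tokens for the party-popper emoji mentioned in those tests.
token_strs = ["token_id:28705", "token_id:31862"]
token_ids = [int(t.removeprefix("token_id:")) for t in token_strs]
assert token_ids == [28705, 31862]
# Decoding token_ids with the served model's tokenizer
# (skip_special_tokens=True) then recovers the original text, which is
# exactly what the tests below verify.
# ---------------------------------------------------------------------------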
+ +import pytest + +from vllm.transformers_utils.tokenizer import get_tokenizer + +from ...utils import RemoteOpenAIServer +from .test_completion import default_server_args # noqa: F401 +from .test_completion import zephyr_lora_added_tokens_files # noqa: F401 +from .test_completion import zephyr_lora_files # noqa: F401 +from .test_completion import zephyr_pa_files # noqa: F401 +from .test_completion import MODEL_NAME + + +@pytest.fixture(scope="module") +def server_with_return_tokens_as_token_ids_flag( + default_server_args): # noqa: F811 + args_with_flag = default_server_args + ["--return-tokens-as-token-ids"] + with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +async def test_completion_return_tokens_as_token_ids_completion( + server_with_return_tokens_as_token_ids_flag): + client = server_with_return_tokens_as_token_ids_flag.get_async_client() + + completion = await client.completions.create( + model=MODEL_NAME, + # Include Unicode characters to test for dividing a single + # character across multiple tokens: 🎉 is [28705, 31862] for the + # Zephyr tokenizer + prompt="Say 'Hello, world! 🎉'", + echo=True, + temperature=0, + max_tokens=10, + logprobs=1) + + text = completion.choices[0].text + token_strs = completion.choices[0].logprobs.tokens + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + # Check that the token representations are consistent between raw tokens + # and top_logprobs + # Slice off the first one, because there's no scoring associated with BOS + top_logprobs = completion.choices[0].logprobs.top_logprobs[1:] + top_logprob_keys = [ + next(iter(logprob_by_tokens)) for logprob_by_tokens in top_logprobs + ] + assert token_strs[1:] == top_logprob_keys + + # Check that decoding the tokens gives the expected text + tokens = [int(token.removeprefix("token_id:")) for token in token_strs] + assert text == tokenizer.decode(tokens, skip_special_tokens=True) + + +@pytest.mark.asyncio +async def test_chat_return_tokens_as_token_ids_completion( + server_with_return_tokens_as_token_ids_flag): + client = server_with_return_tokens_as_token_ids_flag.get_async_client() + response = await client.chat.completions.create( + model=MODEL_NAME, + # Include Unicode characters to test for dividing a single + # character across multiple tokens: 🎉 is [28705, 31862] for the + # Zephyr tokenizer + messages=[{ + "role": "system", + "content": "You like to respond in only emojis, like 🎉" + }, { + "role": "user", + "content": "Please write some emojis: 🐱🐶🎉" + }], + temperature=0, + max_tokens=8, + logprobs=True) + + text = response.choices[0].message.content + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + token_ids = [] + for logprob_content in response.choices[0].logprobs.content: + token_ids.append(int(logprob_content.token.removeprefix("token_id:"))) + assert tokenizer.decode(token_ids, skip_special_tokens=True) == text diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index add5c91900b23..0fe4dd245b5e6 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -254,6 +254,7 @@ async def build_server( prompt_adapters=args.prompt_adapters, request_logger=request_logger, chat_template=args.chat_template, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, ) openai_serving_completion = OpenAIServingCompletion( engine, @@ -262,6 +263,7 @@ async def build_server( lora_modules=args.lora_modules, prompt_adapters=args.prompt_adapters, 
request_logger=request_logger, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, ) openai_serving_embedding = OpenAIServingEmbedding( engine, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 64919c8be8642..a4192937980f7 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -128,6 +128,12 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "using @app.middleware('http'). " "If a class is provided, vLLM will add it to the server " "using app.add_middleware(). ") + parser.add_argument( + "--return-tokens-as-token-ids", + action="store_true", + help="When --max-logprobs is specified, represents single tokens as" + "strings of the form 'token_id:{token_id}' so that tokens that" + "are not JSON-encodable can be identified.") parser = AsyncEngineArgs.add_cli_args(parser) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 3899509ef3ff4..012f70e661100 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -50,13 +50,15 @@ def __init__( prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], chat_template: Optional[str], + return_tokens_as_token_ids: bool = False, ): super().__init__(engine=engine, model_config=model_config, served_model_names=served_model_names, lora_modules=lora_modules, prompt_adapters=prompt_adapters, - request_logger=request_logger) + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids) self.response_role = response_role @@ -522,11 +524,14 @@ def _get_top_logprobs( self, logprobs: Dict[int, Logprob], top_logprobs: Optional[int], tokenizer: PreTrainedTokenizer) -> List[ChatCompletionLogProb]: return [ - ChatCompletionLogProb( - token=(token := self._get_decoded_token(p[1], p[0], - tokenizer)), - logprob=max(p[1].logprob, -9999.0), - bytes=list(token.encode("utf-8", errors="replace"))) + ChatCompletionLogProb(token=(token := self._get_decoded_token( + p[1], + p[0], + tokenizer, + return_as_token_id=self.return_tokens_as_token_ids)), + logprob=max(p[1].logprob, -9999.0), + bytes=list( + token.encode("utf-8", errors="replace"))) for i, p in enumerate(logprobs.items()) if top_logprobs and i < top_logprobs ] @@ -546,6 +551,8 @@ def _create_chat_logprobs( step_top_logprobs = top_logprobs[i] if step_top_logprobs is None: token = tokenizer.decode(token_id) + if self.return_tokens_as_token_ids: + token = f"token_id:{token_id}" logprobs_content.append( ChatCompletionLogProbsContent( token=token, @@ -553,7 +560,9 @@ def _create_chat_logprobs( else: logprobs_content.append( ChatCompletionLogProbsContent( - token=step_top_logprobs[token_id].decoded_token, + token=self._get_decoded_token( + step_top_logprobs[token_id], token_id, tokenizer, + self.return_tokens_as_token_ids), logprob=max(step_top_logprobs[token_id].logprob, -9999.0), bytes=list( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 6aef4c9f96150..73e420141813e 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -51,13 +51,15 @@ def __init__( lora_modules: Optional[List[LoRAModulePath]], prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], + return_tokens_as_token_ids: bool = False, ): super().__init__(engine=engine, model_config=model_config, 
served_model_names=served_model_names, lora_modules=lora_modules, prompt_adapters=prompt_adapters, - request_logger=request_logger) + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids) async def create_completion(self, request: CompletionRequest, raw_request: Request): @@ -430,12 +432,17 @@ def _create_completion_logprobs( step_top_logprobs = top_logprobs[i] if step_top_logprobs is None: token = tokenizer.decode(token_id) + if self.return_tokens_as_token_ids: + token = f"token_id:{token_id}" out_tokens.append(token) out_token_logprobs.append(None) out_top_logprobs.append(None) else: - token = self._get_decoded_token(step_top_logprobs[token_id], - token_id, tokenizer) + token = self._get_decoded_token( + step_top_logprobs[token_id], + token_id, + tokenizer, + return_as_token_id=self.return_tokens_as_token_ids) token_logprob = max(step_top_logprobs[token_id].logprob, -9999.0) out_tokens.append(token) @@ -448,7 +455,11 @@ def _create_completion_logprobs( out_top_logprobs.append({ # Convert float("-inf") to the # JSON-serializable float that OpenAI uses - self._get_decoded_token(top_lp[1], top_lp[0], tokenizer): + self._get_decoded_token( + top_lp[1], + top_lp[0], + tokenizer, + return_as_token_id=self.return_tokens_as_token_ids): max(top_lp[1].logprob, -9999.0) for i, top_lp in enumerate(step_top_logprobs.items()) if num_output_top_logprobs >= i diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 8c6bd10b9b4d4..321c9ac2c1d5f 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -68,6 +68,7 @@ def __init__( lora_modules: Optional[List[LoRAModulePath]], prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], + return_tokens_as_token_ids: bool = False, ): super().__init__() @@ -102,6 +103,7 @@ def __init__( prompt_adapter_num_virtual_tokens=num_virtual_tokens)) self.request_logger = request_logger + self.return_tokens_as_token_ids = return_tokens_as_token_ids async def show_available_models(self) -> ModelList: """Show available models. 
Right now we only have one model.""" @@ -384,11 +386,13 @@ def _log_inputs( ) @staticmethod - def _get_decoded_token( - logprob: Logprob, - token_id: int, - tokenizer: AnyTokenizer, - ) -> str: + def _get_decoded_token(logprob: Logprob, + token_id: int, + tokenizer: AnyTokenizer, + return_as_token_id: bool = False) -> str: + if return_as_token_id: + return f"token_id:{token_id}" + if logprob.decoded_token is not None: return logprob.decoded_token return tokenizer.decode(token_id) From 9e169a4c619c33ec4f9a14c5e971e3aa34bc4444 Mon Sep 17 00:00:00 2001 From: Alphi <52458637+HwwwwwwwH@users.noreply.github.com> Date: Thu, 25 Jul 2024 11:59:30 +0800 Subject: [PATCH 025/167] [Model] Adding support for MiniCPM-V (#4087) --- .../dev/multimodal/multimodal_index.rst | 2 + docs/source/models/supported_models.rst | 4 + examples/minicpmv_example.py | 53 ++ tests/conftest.py | 11 +- tests/models/test_minicpmv.py | 163 +++++ vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/llama.py | 4 +- vllm/model_executor/models/minicpm.py | 3 +- vllm/model_executor/models/minicpmv.py | 682 ++++++++++++++++++ vllm/multimodal/__init__.py | 3 +- vllm/multimodal/base.py | 34 +- 11 files changed, 942 insertions(+), 18 deletions(-) create mode 100644 examples/minicpmv_example.py create mode 100644 tests/models/test_minicpmv.py create mode 100644 vllm/model_executor/models/minicpmv.py diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst index 7cdbec2c9e3d4..9784f4cc2e088 100644 --- a/docs/source/dev/multimodal/multimodal_index.rst +++ b/docs/source/dev/multimodal/multimodal_index.rst @@ -40,6 +40,8 @@ Registry Base Classes ------------ +.. autodata:: vllm.multimodal.NestedTensors + .. autodata:: vllm.multimodal.BatchedTensors .. autoclass:: vllm.multimodal.MultiModalDataBuiltins diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 068c00da39cd9..dc8bd6fb245df 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -206,6 +206,10 @@ Vision Language Models - Phi-3-Vision - :code:`microsoft/Phi-3-vision-128k-instruct`, etc. - + * - :code:`MiniCPM-V` + - MiniCPM-V + - :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc. + - If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py new file mode 100644 index 0000000000000..52366a7030ad0 --- /dev/null +++ b/examples/minicpmv_example.py @@ -0,0 +1,53 @@ +from transformers import AutoTokenizer + +from vllm import LLM, SamplingParams +from vllm.assets.image import ImageAsset + +# 2.0 +# MODEL_NAME = "HwwwH/MiniCPM-V-2" +# 2.5 +MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5" + +image = ImageAsset("stop_sign").pil_image.convert("RGB") + +tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) +llm = LLM(model=MODEL_NAME, + gpu_memory_utilization=1, + trust_remote_code=True, + max_model_len=4096) + +messages = [{ + 'role': + 'user', + 'content': + '(./)\n' + "What's the content of the image?" 
+}] +prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) +# 2.0 +# stop_token_ids = [tokenizer.eos_id] +# 2.5 +stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] + +sampling_params = SamplingParams( + stop_token_ids=stop_token_ids, + # temperature=0.7, + # top_p=0.8, + # top_k=100, + # seed=3472, + max_tokens=1024, + # min_tokens=150, + temperature=0, + use_beam_search=True, + # length_penalty=1.2, + best_of=3) + +outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": { + "image": image + } +}, + sampling_params=sampling_params) +print(outputs[0].outputs[0].text) diff --git a/tests/conftest.py b/tests/conftest.py index 7f507310cd255..59510075b0063 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,7 +11,7 @@ import torch.nn.functional as F from PIL import Image from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq, - AutoTokenizer, BatchEncoding) + AutoTokenizer, BatchEncoding, BatchFeature) from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset @@ -133,7 +133,7 @@ def image_assets() -> _ImageAssets: return IMAGE_ASSETS -_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding) +_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature) class HfRunner: @@ -339,7 +339,6 @@ def generate_greedy_logprobs_limit( processor_kwargs["images"] = images[i] inputs = self.processor(**processor_kwargs) - input_ids = inputs.input_ids output = self.model.generate( **self.wrap_device(inputs), @@ -381,7 +380,7 @@ def generate_greedy_logprobs_limit( all_logprobs.append(seq_logprobs_lst) seq_ids = output.sequences[0] - output_len = seq_ids.shape[0] - input_ids.shape[1] + output_len = len(seq_logprobs_lst) output_ids = seq_ids[-output_len:] all_output_ids.append(output_ids.tolist()) all_output_strs.append(self.tokenizer.decode(output_ids)) @@ -514,10 +513,12 @@ def generate_greedy_logprobs( max_tokens: int, num_logprobs: int, images: Optional[List[Image.Image]] = None, + stop_token_ids: Optional[List[int]] = None, ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: greedy_logprobs_params = SamplingParams(temperature=0.0, max_tokens=max_tokens, - logprobs=num_logprobs) + logprobs=num_logprobs, + stop_token_ids=stop_token_ids) outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params, images=images) diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py new file mode 100644 index 0000000000000..9124fa7a6238c --- /dev/null +++ b/tests/models/test_minicpmv.py @@ -0,0 +1,163 @@ +from collections import UserDict +from typing import List, Optional, Tuple, Type + +import pytest +import torch +import torch.types +from transformers import BatchFeature + +from vllm.multimodal.utils import rescale_image_size +from vllm.sequence import SampleLogprobs + +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from .utils import check_logprobs_close + +pytestmark = pytest.mark.vlm + +# The image token is placed before "user" on purpose so that the test can pass +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \ + "(./)\nWhat's the content of the image?<|eot_id|>" \ + "<|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 + "cherry_blossom": + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \ + "(./)\nWhat is the season?<|eot_id|>" \ + "<|start_header_id|>assistant<|end_header_id|>\n\n" +}) + +models = ["openbmb/MiniCPM-Llama3-V-2_5"] + + +def 
trunc_hf_output(hf_output: Tuple[List[int], str, + Optional[SampleLogprobs]]): + output_ids, output_str, out_logprobs = hf_output + if output_str.endswith("<|eot_id|>"): + output_str = output_str.split("<|eot_id|>")[0] + return output_ids, output_str, out_logprobs + + +target_dtype = "half" + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + model: str, + *, + size_factors: List[float], + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalDataDict objects + and corresponding vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). + + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + max_model_len=4096, + max_num_seqs=1, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + tokenizer = vllm_model.model.get_tokenizer() + stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=vllm_images, + stop_token_ids=stop_token_ids) + for prompts, vllm_images in inputs_per_image + ] + + with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad(): + + class NestedInputs(UserDict): + + def __init__(self, model_inputs: BatchFeature): + super().__init__({"model_inputs": model_inputs}) + + self.model_inputs = model_inputs + + def to(self, device: torch.types.Device): + return NestedInputs(self.model_inputs.to(device)) + + hf_processor = hf_model.processor + hf_model.processor = lambda **kw: NestedInputs( + hf_processor(**kw) # type: ignore + ) + + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=hf_images, + tokenizer=tokenizer) + for prompts, hf_images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + check_logprobs_close( + outputs_0_lst=[ + trunc_hf_output(hf_output) for hf_output in hf_outputs + ], + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + 
dtype: str, max_tokens: int, num_logprobs: int) -> None: + run_test( + hf_runner, + vllm_runner, + image_assets, + model, + size_factors=size_factors, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 55a039a88d535..7df5b8fa64710 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -50,6 +50,7 @@ "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), + "MiniCPMV": ("minicpmv", "MiniCPMV"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2052c443a8885..306d22e42ed1d 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -418,9 +418,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + input_embeds: Optional[torch.Tensor] = None ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + input_embeds) return model_output def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 4ccf1cf0fad76..7a8ac0bb1f949 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -463,10 +463,11 @@ def forward( positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + input_embeds: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata) + attn_metadata, input_embeds) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py new file mode 100644 index 0000000000000..8563216d9c392 --- /dev/null +++ b/vllm/model_executor/models/minicpmv.py @@ -0,0 +1,682 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only MiniCPM-V-2 model compatible with HuggingFace weights.""" +import math +import re +from functools import partial +from typing import Iterable, List, Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +from PIL import Image +from torch import nn +from torch.nn.init import trunc_normal_ +from transformers.configuration_utils import PretrainedConfig +from transformers.models.idefics2.modeling_idefics2 import ( + Idefics2VisionTransformer) + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import SupportsVision +from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.model_executor.models.minicpm import MiniCPMForCausalLM +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import (cached_get_image_processor, + cached_get_tokenizer) +from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData + +_KEYS_TO_MODIFY_MAPPING = { + "language_model.lm_head": "lm_head", + "language_model.model": "language_model", +} + + +def get_abs_pos(abs_pos, tgt_size): + # abs_pos: L, C + # tgt_size: (H, W) + # return: M, C + src_size = int(math.sqrt(abs_pos.size(0))) + # tgt_size = int(math.sqrt(tgt_size)) + dtype = abs_pos.dtype + + return F.interpolate( + abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2), + size=(tgt_size[0], tgt_size[1]), + mode="bicubic", + align_corners=False, + ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype) + + +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_2d_sincos_pos_embed(embed_dim, + grid_size, + cls_token=False, + version=2.0): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or + [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_h_size, grid_w_size = grid_size, grid_size + else: + grid_h_size, grid_w_size = grid_size[0], grid_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + if version == 2.0: + grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], + axis=0) + else: + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version=2.0): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid( + embed_dim // 2, grid[0], version) # (H*W, D/2) or (H, W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid( + embed_dim // 2, grid[1], version) # (H*W, D/2) or (H, W, D/2) + + if version == 2.0: + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + else: + emb = np.concatenate([emb_h, emb_w], axis=-1) # (H, W, D) + return emb + + +def 
get_1d_sincos_pos_embed_from_grid(embed_dim, pos, version=2.0): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) / (H, W) + out: (M, D) / (H, W, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. / 10000**omega # (D/2,) + + if version == 2.0: + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + else: + out = np.einsum('hw,d->hwd', pos, omega) # (H, W, D/2), outer product + emb_sin = np.sin(out) # (H, W, D/2) + emb_cos = np.cos(out) # (H, W, D/2) + emb = np.concatenate([emb_sin, emb_cos], axis=-1) # (H, W, D) + return emb + + +class Resampler(nn.Module): + """ + A 2D perceiver-resampler network with one cross attention layers by + (grid_size**2) learnable queries and 2d sincos pos_emb + Outputs: + A tensor with the shape of (grid_size**2, embed_dim) + """ + + default_norm_layer = partial(nn.LayerNorm, eps=1e-6) + + def __init__(self, + num_queries, + grid_size, + embed_dim, + num_heads, + kv_dim=None, + norm_layer=default_norm_layer, + adaptive=False, + max_size=(70, 70), + version=2.0): + super().__init__() + + self.version = version + if self.version == 2.0: + self.num_queries = grid_size**2 + else: + self.num_queries = num_queries + self.max_size = max_size + self.embed_dim = embed_dim + self.num_heads = num_heads + self.adaptive = adaptive + + self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim)) + trunc_normal_(self.query, std=.02) + + if kv_dim is not None and kv_dim != embed_dim: + self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False) + else: + self.kv_proj = nn.Identity() + + self.attn = nn.MultiheadAttention(embed_dim, num_heads) + self.ln_q = norm_layer(embed_dim) + self.ln_kv = norm_layer(embed_dim) + + self.ln_post = norm_layer(embed_dim) + self.proj = nn.Parameter( + (embed_dim**-0.5) * torch.randn(embed_dim, embed_dim)) + + if self.version == 2.0: + self.pos_embed = nn.Parameter( + torch.from_numpy( + get_2d_sincos_pos_embed( + embed_dim, grid_size, + version=self.version)).float()).requires_grad_(False) + else: + self._set_2d_pos_cache(self.max_size) + + self.apply(self._init_weights) + + def _set_2d_pos_cache(self, max_size, device='cpu'): + pos_embed = torch.from_numpy( + get_2d_sincos_pos_embed(self.embed_dim, + max_size, + version=self.version)).float().to(device) + self.register_buffer("pos_embed", pos_embed, persistent=False) + + def _adjust_pos_cache(self, tgt_sizes, device): + max_h = torch.max(tgt_sizes[:, 0]) + max_w = torch.max(tgt_sizes[:, 1]) + if max_h > self.max_size[0] or max_w > self.max_size[1]: + self.max_size = [ + max(max_h, self.max_size[0]), + max(max_w, self.max_size[1]) + ] + self._set_2d_pos_cache(self.max_size, device) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward_2_5(self, x, tgt_sizes=None): + assert x.shape[0] == tgt_sizes.shape[0] + bs = x.shape[0] + + device = x.device + dtype = x.dtype + + patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] + + self._adjust_pos_cache(tgt_sizes, device=device) + + max_patch_len = torch.max(patch_len) + key_padding_mask = torch.zeros((bs, 
max_patch_len), + dtype=torch.bool, + device=device) + + pos_embed = [] + for i in range(bs): + tgt_h, tgt_w = tgt_sizes[i] + pos_embed.append(self.pos_embed[:tgt_h, :tgt_w, :].reshape( + (tgt_h * tgt_w, -1)).to(dtype)) # patches * D + key_padding_mask[i, patch_len[i]:] = True + + pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, + batch_first=True, + padding_value=0.0).permute( + 1, 0, + 2) # BLD => L * B * D + + x = self.kv_proj(x) # B * L * D + x = self.ln_kv(x).permute(1, 0, 2) # L * B * D + + q = self.ln_q(self.query) # Q * D + + out = self.attn( + self._repeat(q, bs), # Q * B * D + x + pos_embed, # L * B * D + L * B * D + x, + key_padding_mask=key_padding_mask)[0] + # out: Q * B * D + x = out.permute(1, 0, 2) # B * Q * D + + x = self.ln_post(x) + x = x @ self.proj + return x + + def forward_2(self, x, tgt_sizes=None, attn_mask=None): + if self.adaptive: + pos_embed = torch.Tensor( + get_2d_sincos_pos_embed(self.embed_dim, + tgt_sizes)).float().to(device=x.device, + dtype=x.dtype) + else: + pos_embed = get_abs_pos(self.pos_embed, tgt_sizes) + + x = self.kv_proj(x) + x = self.ln_kv(x).permute(1, 0, 2) + + N = x.shape[1] + q = self.ln_q(self.query) + out = self.attn(self._repeat(q, N) + self.pos_embed.unsqueeze(1), + x + pos_embed.unsqueeze(1), + x, + attn_mask=attn_mask)[0] + x = out.permute(1, 0, 2) + + x = self.ln_post(x) + x = x @ self.proj + return x + + def forward(self, x, tgt_sizes=None, attn_mask=None): + if self.version == 2.0: + return self.forward_2(x, tgt_sizes=tgt_sizes, attn_mask=attn_mask) + else: + return self.forward_2_5(x, tgt_sizes=tgt_sizes) + + def _repeat(self, query, N: int): + return query.unsqueeze(1).repeat(1, N, 1) + + +def get_max_minicpmv_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(PretrainedConfig) + return getattr(hf_config, "query_num", 64) + + +def dummy_seq_data_for_minicpmv(seq_len: int): + token_ids = [0] * seq_len + return SequenceData(token_ids) + + +def dummy_image_for_minicpmv(hf_config): + width = height = hf_config.image_size + image = Image.new("RGB", (width, height), color=0) + return {"image": image} + + +def dummy_data_for_minicpmv(ctx: InputContext, seq_len: int): + hf_config = ctx.get_hf_config(PretrainedConfig) + + # image_feature_size = get_max_minicpmv_image_tokens(ctx) + + seq_data = dummy_seq_data_for_minicpmv(seq_len) + + mm_data = dummy_image_for_minicpmv(hf_config) + + return seq_data, mm_data + + +def input_processor_for_minicpmv(ctx: InputContext, llm_inputs: LLMInputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + model_config = ctx.model_config + + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + + prompt = llm_inputs.get("prompt") + if prompt is None: + token_ids = llm_inputs.get("prompt_token_ids") + prompt = tokenizer.decode(token_ids) + image_processor = cached_get_image_processor(model_config.tokenizer) + + pattern = "(./)" + image = multi_modal_data["image"] + image_tags = re.findall(pattern, prompt) + assert len(image_tags) <= 1 + text_chunks = prompt.split(pattern) + new_prompt = text_chunks[0] \ + + image_processor.get_slice_image_placeholder(image.size) \ + + text_chunks[1] + + new_token_ids = tokenizer.encode(new_prompt) + + llm_inputs = LLMInputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + return llm_inputs + + +@MULTIMODAL_REGISTRY.register_image_input_mapper() 
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_minicpmv_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_minicpmv) +@INPUT_REGISTRY.register_input_processor(input_processor_for_minicpmv) +class MiniCPMV(nn.Module, SupportsVision): + + def __init__( + self, + config, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.multimodal_config = multimodal_config + + self.version = float(self.config.version) + self.llm = self.init_llm(config, cache_config, quant_config) + self.vpm = self.init_vision_module() + param_dtype = torch.get_default_dtype() + self.vpm.to(dtype=param_dtype) + self.vision_dim = self.vpm.embed_dim if self.version == 2.0 \ + else self.vpm.embeddings.embed_dim + self.embed_dim = self.llm.config.hidden_size + self.resampler = self.init_resampler(self.embed_dim, self.vision_dim) + self.resampler.to(device="cuda", dtype=param_dtype) + self.sampler = Sampler() + + def init_llm(self, config, cache_config, quant_config): + if self.version == 2.0: + return MiniCPMForCausalLM(config, + cache_config=cache_config, + quant_config=quant_config) + else: + return LlamaForCausalLM(config, + cache_config=cache_config, + quant_config=quant_config) + + def init_vision_module(self): + if self.version == 2.0: + try: + import timm + except ImportError: + raise ImportError( + 'Please install timm==0.9.10') from ImportError + default_dtype = torch.get_default_dtype() + torch.set_default_dtype(torch.float16) + model = timm.create_model('vit_so400m_patch14_siglip_384.webli', + pretrained=False, + num_classes=0, + dynamic_img_size=True, + dynamic_img_pad=True) + torch.set_default_dtype(default_dtype) + if isinstance(model, timm.models.VisionTransformer + ) and model.attn_pool is not None: + model.attn_pool = torch.nn.Identity() + + if self.config.drop_vision_last_layer: + model.blocks = model.blocks[:-1] + else: + model = Idefics2VisionTransformer(self.config.vision_config) + if self.config.drop_vision_last_layer: + model.encoder.layers = model.encoder.layers[:-1] + return model + + def init_resampler(self, embed_dim, vision_dim): + default_dtype = torch.get_default_dtype() + torch.set_default_dtype(torch.float16) + if self.version == 2.0: + resampler = Resampler(grid_size=int( + math.sqrt(self.config.query_num)), + num_queries=None, + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_dim, + adaptive=True, + version=self.version) + else: + resampler = Resampler(num_queries=self.config.query_num, + grid_size=None, + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_dim, + adaptive=True, + version=self.version) + torch.set_default_dtype(default_dtype) + return resampler + + def get_vision_embedding(self, + pixel_values, + patch_attn_mask=None, + tgt_sizes=None, + version=2.0): + if version == 2.0: + res = [] + dtype = self.vpm.pos_embed.data.dtype + for pixel_value in pixel_values: + # V2.0 start + H, W = pixel_value[0].shape[-2:] + tgt_size = (math.ceil(H / self.vpm.patch_embed.patch_size[0]), + math.ceil(W / self.vpm.patch_embed.patch_size[0])) + # V2.0 end + vision_embedding = self.vpm.forward_features( + pixel_value.unsqueeze(0).type(dtype)) + if hasattr(self.vpm, 'num_prefix_tokens' + ) and self.vpm.num_prefix_tokens > 0: + vision_embedding = vision_embedding[:, self.vpm. 
+ num_prefix_tokens:] + res.append(self.resampler(vision_embedding, tgt_size)) + return torch.vstack(res) + else: + vision_embedding = self.vpm( + pixel_values.type(dtype), + patch_attention_mask=patch_attn_mask).last_hidden_state + vision_embedding = self.resampler(vision_embedding, tgt_sizes) + + def get_image_bounds(self, input_ids): + tokenizer = cached_get_tokenizer(self.config._name_or_path, + trust_remote_code=True) + im_start_token_id = tokenizer.im_start_id + im_end_token_id = tokenizer.im_end_id + image_start_tokens = torch.where(input_ids == im_start_token_id)[0] + image_start_tokens += 1 + image_end_tokens = torch.where(input_ids == im_end_token_id)[0] + valid_image_nums = min(len(image_start_tokens), len(image_end_tokens)) + if valid_image_nums == 0: + return [] + image_bound = torch.hstack([ + image_start_tokens[:valid_image_nums].unsqueeze(-1), + image_end_tokens[:valid_image_nums].unsqueeze(-1), + ]) + + return image_bound + + def get_vision_hidden_states(self, data): + if "vision_hidden_states" not in data: + pixel_values = data["pixel_values"] + tgt_sizes = data["tgt_sizes"] + vision_hidden_states = [] + if self.version == 2.0: + if pixel_values is not None and len(pixel_values) > 0: + vision_hidden_states = self.get_vision_embedding( + pixel_values) + else: + vision_hidden_states = torch.tensor([]).to( + data["input_ids"].device) + else: + device = self.vpm.embeddings.position_embedding.weight.device + dtype = self.vpm.embeddings.position_embedding.weight.dtype + all_pixel_values = [ + i.flatten(end_dim=1).permute(1, 0) for i in pixel_values + ] + if all_pixel_values: + tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32) + max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1]) + all_pixel_values = torch.nn.utils.rnn.pad_sequence( + all_pixel_values, batch_first=True, padding_value=0.0) + B, L, _ = all_pixel_values.shape + all_pixel_values = all_pixel_values.permute( + 0, 2, 1).reshape(B, 3, -1, L) + + patch_attn_mask = torch.zeros((B, 1, max_patches), + dtype=torch.bool, + device=device) + for i in range(B): + patch_attn_mask[i, :tgt_sizes[i][0] * + tgt_sizes[i][1]] = True + + vision_embedding = self.vpm( + all_pixel_values.type(dtype), + patch_attention_mask=patch_attn_mask).last_hidden_state + vision_hidden_states = self.resampler( + vision_embedding, tgt_sizes) + + else: # no image + dummy_feature = [] + vision_hidden_states = dummy_feature + else: + vision_hidden_states = data["vision_hidden_states"] + + return vision_hidden_states + + def get_embedding(self, data): + input_ids = data["input_ids"] + + vision_hidden_states = self.get_vision_hidden_states(data) + if vision_hidden_states is not None and len(vision_hidden_states) > 0: + image_bounds = self.get_image_bounds(input_ids) + else: + image_bounds = [] + + if hasattr(self.llm.config, 'scale_emb'): + vlm_embedding = self.llm.model.embed_tokens( + input_ids) * self.llm.config.scale_emb + else: + vlm_embedding = self.llm.model.embed_tokens(input_ids) + vision_hidden_states = [ + i.type(vlm_embedding.dtype) if isinstance(i, torch.Tensor) else i + for i in vision_hidden_states + ] + + if len(vision_hidden_states) > 0 and len(image_bounds) > 0: + vision_hidden_states = torch.cat(vision_hidden_states, dim=0) + image_indices = torch.stack([ + torch.arange(r[0], r[1], dtype=torch.long) + for r in image_bounds + ]).to(vlm_embedding.device) + vlm_embedding.scatter_( + 0, + image_indices.view(-1, 1).repeat(1, vlm_embedding.shape[-1]), + vision_hidden_states.view(-1, vision_hidden_states.shape[-1])) + return 
vlm_embedding, vision_hidden_states + + def process_multimodal_inputs(self, inputs): + pixel_values = [] + tgt_sizes = [] + for b in range(len(inputs["pixel_values"])): + pixel_values += inputs["pixel_values"][b] + tgt_sizes += inputs["tgt_sizes"][b] + return { + "pixel_values": pixel_values, + "input_ids": inputs["input_ids"], + "tgt_sizes": tgt_sizes + } + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ): + inputs = { + "pixel_values": kwargs.pop("pixel_values", []), + "input_ids": input_ids, + "tgt_sizes": kwargs.pop("tgt_sizes", None), + } + + inputs = self.process_multimodal_inputs(inputs) + + vlm_embeddings, vision_hidden_states = self.get_embedding(inputs) + + output = self.llm(input_ids=None, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + input_embeds=vlm_embeddings) + return output + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + return self.llm.compute_logits(hidden_states, sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.llm.sample(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + # for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): + # if key_to_modify in name: + # name = name.replace(key_to_modify, new_key) + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + use_default_weight_loading = False + if "vpm" in name or 'resampler' in name: + # We only do sharding for language model and + # not vision model for now. 
+ use_default_weight_loading = True + else: + for (param_name, weight_name, + shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + use_default_weight_loading = True + if use_default_weight_loading: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 503dceab5b168..0e3b35d425cb7 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,5 +1,5 @@ from .base import (BatchedTensors, MultiModalDataBuiltins, MultiModalDataDict, - MultiModalInputs, MultiModalPlugin) + MultiModalInputs, MultiModalPlugin, NestedTensors) from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() @@ -17,6 +17,7 @@ "MultiModalDataDict", "MultiModalInputs", "MultiModalPlugin", + "NestedTensors", "MULTIMODAL_REGISTRY", "MultiModalRegistry", ] diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 3ebc25c5930cf..0d435bd644e29 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from collections import UserDict, defaultdict from typing import (Any, Callable, Dict, List, Optional, Type, TypedDict, - TypeVar, Union) + TypeVar, Union, cast) import torch import torch.types @@ -15,10 +15,17 @@ logger = init_logger(__name__) -BatchedTensors = Union[torch.Tensor, List[torch.Tensor]] +NestedTensors = Union[List[torch.Tensor], torch.Tensor] +""" +Use a list instead of a tensor if the dimensions of each element do not match. +Currently only supports up to singly nested list of tensors. +""" + +BatchedTensors = Union[List[NestedTensors], NestedTensors] """ If each input tensor in the batch has the same size, this is a single batched -tensor; otherwise, this is a list of tensors with one element per batch. +tensor; otherwise, this is a list of :class:`NestedTensors` with one element +per item in the batch. 
""" if sys.version_info < (3, 9): @@ -27,7 +34,7 @@ class _MultiModalInputsBase(UserDict): pass else: - class _MultiModalInputsBase(UserDict[str, torch.Tensor]): + class _MultiModalInputsBase(UserDict[str, NestedTensors]): pass @@ -39,19 +46,26 @@ class MultiModalInputs(_MultiModalInputsBase): @staticmethod def try_concat( - tensors: List[torch.Tensor], + tensors: List[NestedTensors], *, device: torch.types.Device, ) -> BatchedTensors: - unbatched_shape = tensors[0].shape[1:] + # may be list rather than tensors + if isinstance(tensors[0], list): + return [[t.to(device=device) for t in tensor[0]] + for tensor in tensors] + + tensors_ = cast(List[torch.Tensor], tensors) + + unbatched_shape = tensors_[0].shape[1:] - for tensor in tensors: + for tensor in tensors_: if tensor.shape[1:] != unbatched_shape: return [ - tensor.squeeze(0).to(device=device) for tensor in tensors + tensor.squeeze(0).to(device=device) for tensor in tensors_ ] - return torch.cat(tensors, dim=0).to(device=device) + return torch.cat(tensors_, dim=0).to(device=device) @staticmethod def batch( @@ -64,7 +78,7 @@ def batch( keys = inputs_list[0].keys() - item_lists: Dict[str, List[torch.Tensor]] = defaultdict(list) + item_lists: Dict[str, List[NestedTensors]] = defaultdict(list) for inputs in inputs_list: if inputs.keys() != keys: From 309aaef8255fb832bf674c6ed7d9d84211629421 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 24 Jul 2024 22:33:56 -0700 Subject: [PATCH 026/167] [Bugfix] Fix decode tokens w. CUDA graph (#6757) --- tests/worker/test_model_runner.py | 1 + vllm/attention/backends/flash_attn.py | 12 ++++++++++-- vllm/attention/backends/flashinfer.py | 11 ++++++++++- vllm/attention/backends/utils.py | 11 ++++++++++- 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index b5742c4338616..4a0e2b4184936 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -193,6 +193,7 @@ def test_prepare_decode_cuda_graph(batch_size): for _ in range(expected_bs - len(seq_lens)): seq_lens.append(1) assert attn_metadata.seq_lens == seq_lens + assert attn_metadata.num_decode_tokens == len(seq_lens) start_idx = 0 start_loc = [start_idx] for _ in context_lens: diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 949bd973cf3c4..7d7aff9dc3cdc 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -272,7 +272,15 @@ def _add_seq_group( def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): - """Build attention metadata with on-device tensors.""" + """Build attention metadata with on-device tensors. + + Args: + seq_lens: The maybe padded sequence lengths of the input sequences. + query_lens: The query lengths of the input sequences. + cuda_graph_pad_size: The padding size for cuda graph. + -1 if cuda graph is not used. + batch_size: The maybe padded batch size. + """ for inter_data in self.input_builder.inter_data_list: self._add_seq_group(inter_data, self.input_builder.chunked_prefill_enabled) @@ -297,7 +305,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], if use_captured_graph: self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) self.block_tables.extend([] * cuda_graph_pad_size) - num_decode_tokens = batch_size + cuda_graph_pad_size + num_decode_tokens = batch_size # The shape of graph_block_tables is # [max batch size, max context len // block size]. 
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 9746304347d6e..83a420d76834b 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -320,6 +320,15 @@ def _update_paged_kv_tensors(self, block_table: List[int], seq_len: int): def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): + """Build attention metadata with on-device tensors. + + Args: + seq_lens: The maybe padded sequence lengths of the input sequences. + query_lens: The query lengths of the input sequences. + cuda_graph_pad_size: The padding size for cuda graph. + -1 if cuda graph is not used. + batch_size: The maybe padded batch size. + """ for inter_data in self.input_builder.inter_data_list: self._add_seq_group(inter_data, self.input_builder.chunked_prefill_enabled) @@ -334,7 +343,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], if use_captured_graph: self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) self.block_tables.extend([] * cuda_graph_pad_size) - num_decode_tokens = batch_size + cuda_graph_pad_size + num_decode_tokens = batch_size # The shape of graph_block_tables is # [max batch size, max context len // block size]. diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 5877712b9b7d3..dcd10ed410a79 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -149,6 +149,15 @@ def _add_seq_group( def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): + """Build attention metadata with on-device tensors. + + Args: + seq_lens: The maybe padded sequence lengths of the input sequences. + query_lens: The query lengths of the input sequences. + cuda_graph_pad_size: The padding size for cuda graph. + -1 if cuda graph is not used. + batch_size: The maybe padded batch size. + """ for inter_data in self.input_builder.inter_data_list: self._add_seq_group(inter_data, self.input_builder.chunked_prefill_enabled) @@ -173,7 +182,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], if use_captured_graph: self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) self.block_tables.extend([] * cuda_graph_pad_size) - num_decode_tokens = batch_size + cuda_graph_pad_size + num_decode_tokens = batch_size # The shape of graph_block_tables is # [max batch size, max context len // block size]. 
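To make the fixed invariant concrete: after padding, the metadata should report exactly one decode token per (possibly padded) sequence, which is what the new assertion in `tests/worker/test_model_runner.py` pins down. A stripped-down, self-contained version of that check, using a hypothetical stand-in for the real attention metadata object:

```python
from dataclasses import dataclass
from typing import List


@dataclass
class FakeDecodeMetadata:
    # Stand-in for the builder output; the real metadata carries many more
    # fields (slot_mapping, block_tables, ...).
    seq_lens: List[int]
    num_decode_tokens: int


def check_decode_invariant(meta: FakeDecodeMetadata) -> None:
    # One decode token per sequence, padding slots included.
    assert meta.num_decode_tokens == len(meta.seq_lens)


# Three real sequences padded up to a captured batch size of 4.
check_decode_invariant(
    FakeDecodeMetadata(seq_lens=[17, 9, 33, 1], num_decode_tokens=4))
```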
From 0310029a2fc62171fae87155150326125e082a5a Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Thu, 25 Jul 2024 01:34:11 -0400 Subject: [PATCH 027/167] [Bugfix] Fix awq_marlin and gptq_marlin flags (#6745) --- vllm/model_executor/layers/quantization/awq_marlin.py | 5 +++-- vllm/model_executor/layers/quantization/gptq_marlin.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 092f87b623e7f..5ffbb8e854e87 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -25,7 +25,7 @@ class AWQMarlinConfig(QuantizationConfig): def __init__(self, weight_bits: int, group_size: int, has_zp: bool, lm_head_quantized: bool) -> None: self.weight_bits = weight_bits - self.pack_factor = 32 // self.weight_bits # packed into int32 + self.pack_factor = 32 // self.weight_bits # packed into 32bits self.group_size = group_size self.has_zp = has_zp self.lm_head_quantized = lm_head_quantized @@ -69,7 +69,8 @@ def from_config(cls, config: Dict[str, Any]) -> "AWQMarlinConfig": def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: can_convert = cls.is_awq_marlin_compatible(hf_quant_cfg) - is_valid_user_quant = (user_quant is None or user_quant == "marlin") + is_valid_user_quant = (user_quant is None or user_quant == "marlin" + or user_quant == "awq_marlin") if can_convert and is_valid_user_quant: msg = ("The model is convertible to {} during runtime." diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 5b4d614ae2e74..bdcc9c3b4f0c5 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -79,7 +79,8 @@ def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: can_convert = cls.is_gptq_marlin_compatible(hf_quant_cfg) - is_valid_user_quant = (user_quant is None or user_quant == "marlin") + is_valid_user_quant = (user_quant is None or user_quant == "marlin" + or user_quant == "gptq_marlin") if can_convert and is_valid_user_quant: msg = ("The model is convertible to {} during runtime." 
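The flag fix above boils down to widening the set of `--quantization` values the Marlin override accepts: previously an explicit `awq_marlin` or `gptq_marlin` request failed the `is_valid_user_quant` check even when the checkpoint was convertible. A simplified sketch of the relaxed predicate (the real code inlines this expression inside `override_quantization_method` and additionally requires the compatibility check to pass):

```python
from typing import Optional


def is_valid_user_quant(user_quant: Optional[str], backend_name: str) -> bool:
    # Accept: no explicit override, the generic "marlin" alias, or the
    # backend's own name ("awq_marlin" / "gptq_marlin").
    return user_quant is None or user_quant in ("marlin", backend_name)


assert is_valid_user_quant(None, "awq_marlin")
assert is_valid_user_quant("awq_marlin", "awq_marlin")
assert is_valid_user_quant("marlin", "gptq_marlin")
assert not is_valid_user_quant("awq", "awq_marlin")
```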
From 316a41ac1de4e6e46933cadb39b9b7af65b01abd Mon Sep 17 00:00:00 2001 From: Chang Su Date: Wed, 24 Jul 2024 22:48:07 -0700 Subject: [PATCH 028/167] [Bugfix] Fix encoding_format in examples/openai_embedding_client.py (#6755) --- examples/openai_embedding_client.py | 13 ++++++++----- tests/entrypoints/openai/test_embedding.py | 1 - 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/openai_embedding_client.py b/examples/openai_embedding_client.py index b73360fe15a24..b4f4c7ad6beb2 100644 --- a/examples/openai_embedding_client.py +++ b/examples/openai_embedding_client.py @@ -13,11 +13,14 @@ models = client.models.list() model = models.data[0].id -responses = client.embeddings.create(input=[ - "Hello my name is", - "The best thing about vLLM is that it supports many different models" -], - model=model) +responses = client.embeddings.create( + input=[ + "Hello my name is", + "The best thing about vLLM is that it supports many different models" + ], + model=model, + encoding_format="float", +) for data in responses.data: print(data.embedding) # list of float of len 4096 diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 2ca0c0d63c25c..c9747339bbf15 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -18,7 +18,6 @@ def embedding_server(): "--enforce-eager", "--max-model-len", "8192", - "--enforce-eager", ] with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server: From b75e314fff29bdc94b2fb1dd78519e92f9520e65 Mon Sep 17 00:00:00 2001 From: Alphi <52458637+HwwwwwwwH@users.noreply.github.com> Date: Fri, 26 Jul 2024 00:42:49 +0800 Subject: [PATCH 029/167] [Bugfix] Add image placeholder for OpenAI Compatible Server of MiniCPM-V (#6787) Co-authored-by: hezhihui Co-authored-by: Cyrus Leung --- examples/minicpmv_example.py | 2 ++ vllm/entrypoints/chat_utils.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py index 52366a7030ad0..bf20a7ea04ad4 100644 --- a/examples/minicpmv_example.py +++ b/examples/minicpmv_example.py @@ -4,6 +4,8 @@ from vllm.assets.image import ImageAsset # 2.0 +# The official repo doesn't work yet, so we need to use a fork for now +# For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # MODEL_NAME = "HwwwH/MiniCPM-V-2" # 2.5 MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5" diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index dca4523d1a27d..1f6d77b828459 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -100,7 +100,9 @@ def _image_token_str(model_config: ModelConfig, if model_type == "phi3_v": # Workaround since this token is not defined in the tokenizer return "<|image_1|>" - if model_type in ("blip-2", "chatglm", "fuyu", "minicpmv", "paligemma"): + if model_type == "minicpmv": + return "(./)" + if model_type in ("blip-2", "chatglm", "fuyu", "paligemma"): # These models do not use image tokens in the prompt return None if model_type.startswith("llava"): From 889da130e747b1382268ed428352f2e73e51a30b Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Thu, 25 Jul 2024 09:46:04 -0700 Subject: [PATCH 030/167] [ Misc ] `fp8-marlin` channelwise via `compressed-tensors` (#6524) Co-authored-by: mgoin --- .../configs/Qwen2-1.5B-Instruct-FP8W8.yaml | 11 ++ .../lm-eval-harness/configs/models-small.txt | 1 
+ .../compressed_tensors/compressed_tensors.py | 61 ++++++++-- .../compressed_tensors/schemes/__init__.py | 2 + .../schemes/compressed_tensors_scheme.py | 3 +- .../schemes/compressed_tensors_unquantized.py | 3 +- .../schemes/compressed_tensors_w4a16_24.py | 3 +- .../schemes/compressed_tensors_w8a16_fp8.py | 105 ++++++++++++++++++ .../schemes/compressed_tensors_w8a8_fp8.py | 10 +- .../schemes/compressed_tensors_w8a8_int8.py | 19 ++-- .../schemes/compressed_tensors_wNa16.py | 3 +- .../model_executor/layers/quantization/fp8.py | 33 ++++-- .../quantization/utils/marlin_utils_fp8.py | 14 +-- 13 files changed, 219 insertions(+), 49 deletions(-) create mode 100644 .buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml new file mode 100644 index 0000000000000..42936fbfbe7d4 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.578 + - name: "exact_match,flexible-extract" + value: 0.585 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 1d1b0ed38671d..109692395acf6 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -5,3 +5,4 @@ Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml +Qwen2-1.5B-Instruct-FP8W8.yaml diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index c4d0c9cb981da..39d00bd5733ff 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -10,7 +10,8 @@ W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, CompressedTensorsScheme, CompressedTensorsUnquantized, CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, - CompressedTensorsW8A8Int8, CompressedTensorsWNA16) + CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, + CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, QuantizationType, find_matched_target, is_activation_quantization_format, @@ -100,14 +101,18 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": def get_config_filenames(cls) -> List[str]: return [] - def _check_scheme_supported(self, min_capability: int): + def _check_scheme_supported(self, + min_capability: int, + error: bool = True) -> bool: capability = current_platform.get_device_capability() capability = capability[0] * 10 + capability[1] - if capability < min_capability: + supported = capability >= min_capability + if error and not supported: raise RuntimeError( "Quantization scheme is not supported 
for ", f"the current GPU. Min capability: {min_capability}. ", f"Current capability: {capability}.") + return supported def _is_static_tensor_w8a8(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: @@ -170,6 +175,29 @@ def _is_fp8_w8a8(self, weight_quant: BaseModel, # All conditions satisfied. return True + def _is_fp8_w8a16(self, weight_quant: BaseModel, + input_quant: BaseModel) -> bool: + # Confirm weights quantized. + if weight_quant is None: + return False + + # Confirm we have floating points. + if weight_quant.type != QuantizationType.FLOAT: + return False + + # Confirm weight scheme is supported. + is_symmetric_weight = weight_quant.symmetric + is_static_weight = not weight_quant.dynamic + is_per_tensor_or_channel_weight = (weight_quant.strategy in [ + QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL + ]) + if not (is_symmetric_weight and is_static_weight + and is_per_tensor_or_channel_weight): + return False + + # All conditions satisfied. + return True + def _is_wNa16_group_channel(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: input_quant_none = input_quant is None @@ -204,9 +232,23 @@ def _get_scheme_from_parts( # Detect If Activation Quantization. if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8Fp8( + is_fp8_w8a8_supported = self._check_scheme_supported( + CompressedTensorsW8A8Fp8.get_min_capability(), error=False) + if is_fp8_w8a8_supported: + return CompressedTensorsW8A8Fp8( + strategy=weight_quant.strategy, + is_static_input_scheme=(not input_quant.dynamic)) + else: + return CompressedTensorsW8A16Fp8( + strategy=weight_quant.strategy, + is_static_input_scheme=(input_quant + and not input_quant.dynamic)) + + if self._is_fp8_w8a16(weight_quant, input_quant): + return CompressedTensorsW8A16Fp8( strategy=weight_quant.strategy, - is_static_input_scheme=(not input_quant.dynamic)) + is_static_input_scheme=(input_quant + and not input_quant.dynamic)) if self._is_static_tensor_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Int8( @@ -257,11 +299,10 @@ def get_scheme( targets=self.target_scheme_map.keys()) # Find the quant_scheme - scheme = self.target_scheme_map[matched_target] - - return self._get_scheme_from_parts( - weight_quant=scheme["weights"], - input_quant=scheme["input_activations"]) + scheme_dict = self.target_scheme_map[matched_target] + scheme = self._get_scheme_from_parts( + weight_quant=scheme_dict["weights"], + input_quant=scheme_dict["input_activations"]) # Raise error if device does not support the scheme # (e.g. 
fp8 needs ada lovelace) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index dd94c49827f62..ca9e286ce5b2d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -4,6 +4,7 @@ CompressedTensorsW4A16Sparse24) from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8 from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8 +from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8 from .compressed_tensors_wNa16 import (WNA16_SUPPORTED_BITS, CompressedTensorsWNA16) @@ -11,6 +12,7 @@ "CompressedTensorsScheme", "CompressedTensorsUnquantized", "CompressedTensorsWNA16", + "CompressedTensorsW8A16Fp8", "CompressedTensorsW4A16Sparse24", "CompressedTensorsW8A8Int8", "CompressedTensorsW8A8Fp8", diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py index d5f37b47bb87e..b4bab33e1fb1d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py @@ -12,8 +12,9 @@ class CompressedTensorsScheme(ABC): of different quantization schemes supported by CompressedTensors. """ + @classmethod @abstractmethod - def get_min_capability(self) -> int: + def get_min_capability(cls) -> int: """ Get minimum device capability. """ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py index 6203f02d25e90..b7ba29ddc9840 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py @@ -18,7 +18,8 @@ class CompressedTensorsUnquantized(CompressedTensorsScheme): in a linear transformation. 
""" - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # volta and up return 70 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index eec523d00372c..b8ffb22d7a89d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -29,7 +29,8 @@ def __init__(self, raise ValueError( "group_size must be given when using strategy group") - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # ampere + up return 80 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py new file mode 100644 index 0000000000000..eeb7c042e1d1f --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py @@ -0,0 +1,105 @@ +from typing import Callable, List, Optional + +import torch + +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + QuantizationStrategy) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( + apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + convert_to_channelwise, create_per_channel_scale_param, + create_per_tensor_scale_param) +from vllm.model_executor.utils import set_weight_attrs + +__all__ = ["CompressedTensorsW8A16Fp8"] + +SUPPORTED_STRATEGIES = [ + QuantizationStrategy.CHANNEL, QuantizationStrategy.TENSOR +] + + +class CompressedTensorsW8A16Fp8(CompressedTensorsScheme): + + def __init__(self, strategy: str, is_static_input_scheme: bool): + self.strategy = strategy + self.is_static_input_scheme = is_static_input_scheme + + @classmethod + def get_min_capability(cls) -> int: + # ampere and up + return 80 + + # W8A8-Fp8 kernels support only per-tensor and per-channel cases. + # So if we have a fused module (QKV, MLP) with per tensor scales, + # we expand each scale to its shard's channels. 
+ def process_weights_after_loading(self, layer) -> None: + if self.strategy == QuantizationStrategy.TENSOR: + ws_channelwise = convert_to_channelwise(layer.weight_scale, + layer.logical_widths) + layer.weight_scale = torch.nn.Parameter(ws_channelwise, + requires_grad=False) + + # Weights must be transposed for marlin + layer.weight = torch.nn.Parameter(layer.weight.t(), + requires_grad=False) + + prepare_fp8_layer_for_marlin(layer, strategy="channel") + + def create_weights(self, layer: torch.nn.Module, input_size: int, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + layer.orig_dtype = params_dtype + + # WEIGHT + weight = torch.nn.Parameter(torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn), + requires_grad=False) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, { + "input_dim": 1, + "output_dim": 0, + "weight_loader": weight_loader, + }) + + # WEIGHT SCALE + layer_kwargs = {"weight_loader": weight_loader} + if self.strategy == QuantizationStrategy.CHANNEL: + weight_scale = create_per_channel_scale_param( + output_partition_sizes, **layer_kwargs) + elif self.strategy == QuantizationStrategy.TENSOR: + weight_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + else: + raise ValueError( + f"Unsupported weight strategy={self.strategy}, " + f"supported strategies are {SUPPORTED_STRATEGIES}") + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE (to deal with converted checkpoints) + if self.is_static_input_scheme: + input_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + layer.register_parameter("input_scale", input_scale) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + + return apply_fp8_marlin_linear(input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + workspace=layer.workspace, + size_n=layer.output_size_per_partition, + size_k=layer.input_size_per_partition, + bias=bias) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 51156a3bc07af..cc9d71db140c2 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -23,7 +23,8 @@ def __init__(self, strategy: str, is_static_input_scheme: bool): self.is_static_input_scheme = is_static_input_scheme self.cutlass_fp8_supported = cutlass_fp8_supported() - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # lovelace and up return 89 @@ -77,19 +78,20 @@ def create_weights(self, layer: torch.nn.Module, }) # WEIGHT SCALE + layer_kwargs = {"weight_loader": weight_loader} if self.strategy == QuantizationStrategy.CHANNEL: weight_scale = create_per_channel_scale_param( - output_partition_sizes, weight_loader=weight_loader) + output_partition_sizes, **layer_kwargs) else: assert self.strategy == QuantizationStrategy.TENSOR weight_scale = 
create_per_tensor_scale_param( - output_partition_sizes, weight_loader=weight_loader) + output_partition_sizes, **layer_kwargs) layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE if self.is_static_input_scheme: input_scale = create_per_tensor_scale_param( - output_partition_sizes, weight_loader=weight_loader) + output_partition_sizes, **layer_kwargs) layer.register_parameter("input_scale", input_scale) def apply_weights(self, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index e81496c89ac7f..3a80863d3abbe 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -19,7 +19,8 @@ def __init__(self, strategy: str, is_static_input_scheme: bool): self.strategy = strategy self.is_static_input_scheme = is_static_input_scheme - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # turing and up return 75 @@ -68,19 +69,19 @@ def create_weights(self, layer: torch.nn.Module, # WEIGHT SCALE layer_kwargs = {"weight_loader": weight_loader} if self.strategy == QuantizationStrategy.CHANNEL: - scale = create_per_channel_scale_param(output_partition_sizes, - **layer_kwargs) + weight_scale = create_per_channel_scale_param( + output_partition_sizes, **layer_kwargs) else: assert self.strategy == QuantizationStrategy.TENSOR - scale = create_per_tensor_scale_param(output_partition_sizes, - **layer_kwargs) - layer.register_parameter("weight_scale", scale) + weight_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE if self.is_static_input_scheme: - scale = create_per_tensor_scale_param(output_partition_sizes, - **layer_kwargs) - layer.register_parameter("input_scale", scale) + input_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + layer.register_parameter("input_scale", input_scale) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index e4cf0c0b5d95b..996cba315c556 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -42,7 +42,8 @@ def __init__(self, group_size=self.group_size, is_sym=True) - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # ampere and up return 80 diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 3a4f2a49a3497..6649b317ca838 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -18,8 +18,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( is_layer_skipped) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - all_close_1d, apply_fp8_linear, create_per_tensor_scale_param, - cutlass_fp8_supported, per_tensor_dequantize, requantize_with_max_scale) + all_close_1d, 
apply_fp8_linear, convert_to_channelwise, + create_per_tensor_scale_param, cutlass_fp8_supported, + per_tensor_dequantize, requantize_with_max_scale) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.utils import print_warning_once @@ -179,19 +180,29 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.weight_scale = Parameter(weight_scale, requires_grad=False) layer.input_scale = None - # If checkpoint is fp8, requantize the separately quantized logical - # weights into a single fp8 weight with a single weight scale. + # If checkpoint is fp8, handle that there are N scales for N + # shards in a fused module else: - # Dequant -> Quant with max scale. - max_w_scale, weight = requantize_with_max_scale( - weight=layer.weight, - weight_scale=layer.weight_scale, - logical_widths=layer.logical_widths, - ) + # If using marlin (w8a16), kernel uses channelwise weights, + # so extend the weight scales to be channelwise. + if self.use_marlin: + weight = layer.weight + weight_scale = convert_to_channelwise(layer.weight_scale, + layer.logical_widths) + + # If using w8a8, torch._scaled_mm needs per tensor, so + # requantize the logical shards as a single weight. + else: + # Dequant -> Quant with max scale so we can run per tensor. + weight_scale, weight = requantize_with_max_scale( + weight=layer.weight, + weight_scale=layer.weight_scale, + logical_widths=layer.logical_widths, + ) # Update layer with new values. layer.weight = Parameter(weight.t(), requires_grad=False) - layer.weight_scale = Parameter(max_w_scale, requires_grad=False) + layer.weight_scale = Parameter(weight_scale, requires_grad=False) if self.quant_config.activation_scheme == "static": layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index c878939580f10..5f9d8658a342f 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -46,7 +46,8 @@ def apply_fp8_marlin_linear( return output.reshape(out_shape) -def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None: +def prepare_fp8_layer_for_marlin(layer: torch.nn.Module, + strategy: str = "tensor") -> None: print_warning_once( "Your GPU does not have native support for FP8 computation but " "FP8 quantization is being used. 
Weight-only FP8 compression will " @@ -74,16 +75,7 @@ def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None: layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False) # WEIGHT SCALES - # Currently Marlin doesn't support per-tensor scales, so we - # expand it to channelwise - is_channelwise = (len(layer.weight_scale.shape) > 0 - and layer.weight_scale.shape[0] == part_size_n) - if is_channelwise: - scales = layer.weight_scale - else: - scales = layer.weight_scale.repeat(1, part_size_n) - scales = scales.to(layer.orig_dtype).to(device) - + scales = layer.weight_scale.to(layer.orig_dtype) # Permute scales marlin_scales = marlin_permute_scales(s=scales, size_k=part_size_k, From 65b1f121c885f169da210946eddb0d52524677f1 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 25 Jul 2024 12:46:15 -0400 Subject: [PATCH 031/167] [Bugfix] Fix `kv_cache_dtype=fp8` without scales for FP8 checkpoints (#6761) --- tests/quantization/test_fp8.py | 12 ++++++++++-- vllm/model_executor/layers/quantization/kv_cache.py | 6 ++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 0602fedf0b8e3..ad92f1f189f65 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -60,12 +60,20 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="FP8 is not supported on this GPU type.") -def test_load_fp16_model(vllm_runner) -> None: - with vllm_runner("facebook/opt-125m", quantization="fp8") as llm: +@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) +def test_load_fp16_model(vllm_runner, kv_cache_dtype: str) -> None: + with vllm_runner("facebook/opt-125m", + quantization="fp8", + kv_cache_dtype=kv_cache_dtype) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 fc1 = model.model.decoder.layers[0].fc1 assert isinstance(fc1.quant_method, Fp8LinearMethod) + if kv_cache_dtype == "fp8": + attn = model.model.decoder.layers[0].self_attn.attn + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + assert attn._k_scale == 1.0 + assert attn._v_scale == 1.0 capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index c1495711447fa..d79536d196b92 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -46,10 +46,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: elif layer.k_scale < 0.0 and layer.v_scale < 0.0: # If no scales were loaded (both scales are invalid negative # values), use the default value of 1.0 - k_scale = torch.nn.Parameter(torch.tensor(1.0), - requires_grad=False) - v_scale = torch.nn.Parameter(torch.tensor(1.0), - requires_grad=False) + k_scale = 1.0 + v_scale = 1.0 else: # If we find a single kv_scale in the checkpoint, we remap # kv_scale to k_scale during weight loading, and duplicate From 95db75de64bec34f4d80acff92c62d1cdfa94688 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 25 Jul 2024 13:40:01 -0400 Subject: [PATCH 032/167] [Bugfix] Add synchronize to prevent possible data race (#6788) Co-authored-by: Lucas Wilkinson --- vllm/distributed/parallel_state.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/distributed/parallel_state.py 
b/vllm/distributed/parallel_state.py index 128096c88a8b1..e9c6fc3a255e4 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -243,6 +243,13 @@ def graph_capture( ca_comm = self.ca_comm maybe_ca_context = nullcontext( ) if ca_comm is None else ca_comm.capture() + + # ensure all initialization operations complete before attempting to + # capture the graph on another stream + curr_stream = torch.cuda.current_stream() + if curr_stream != stream: + stream.wait_stream(curr_stream) + with torch.cuda.stream(stream), maybe_ca_context: # In graph mode, we have to be very careful about the collective # operations. The current status is: From 6a1e25b1514a25d3da96d0d78c4568f6e581e242 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 25 Jul 2024 11:57:16 -0700 Subject: [PATCH 033/167] [Doc] Add documentations for nightly benchmarks (#6412) --- .buildkite/nightly-benchmarks/README.md | 80 +++++++++++++++---- README.md | 2 +- docs/source/index.rst | 6 ++ .../performance_benchmark/benchmarks.rst | 23 ++++++ 4 files changed, 94 insertions(+), 17 deletions(-) create mode 100644 docs/source/performance_benchmark/benchmarks.rst diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index c84e150934306..c1aebaf5b3bbe 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -3,30 +3,51 @@ ## Introduction -This directory contains the performance benchmarking CI for vllm. -The goal is to help developers know the impact of their PRs on the performance of vllm. +This directory contains two sets of benchmark for vllm. +- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance +- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. -This benchmark will be *triggered* upon: + +See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. + + +## Performance benchmark quick overview + +**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models. + +**Benchmarking Duration**: about 1hr. + +**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run. + + +## Nightly benchmark quick overview + +**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. + +**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy. + +**Benchmarking Duration**: about 3.5hrs. + + + +## Trigger the benchmark + +Performance benchmark will be triggered when: - A PR being merged into vllm. - Every commit for those PRs with `perf-benchmarks` label. -**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for more GPUs is comming later), with different models. +Nightly benchmark will be triggered when: +- Every commit for those PRs with `nightly-benchmarks` label. -**Benchmarking Duration**: about 1hr. -**For benchmarking developers**: please try your best to constraint the duration of benchmarking to less than 1.5 hr so that it won't take forever to run. 
-## Configuring the workload +## Performance benchmark details -The benchmarking workload contains three parts: -- Latency tests in `latency-tests.json`. -- Throughput tests in `throughput-tests.json`. -- Serving tests in `serving-tests.json`. +See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. -See [descriptions.md](tests/descriptions.md) for detailed descriptions. -### Latency test +#### Latency test Here is an example of one test inside `latency-tests.json`: @@ -54,12 +75,12 @@ Note that the performance numbers are highly sensitive to the value of the param WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file. -### Throughput test +#### Throughput test The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`. The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot. -### Serving test +#### Serving test We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: ``` @@ -96,9 +117,36 @@ The number of this test is less stable compared to the delay and latency benchma WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. -## Visualizing the results +#### Visualizing the results The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results. You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. If you do not see the table, please wait till the benchmark finish running. The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking. + + + +## Nightly test details + +See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines. + + +#### Workflow + +- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. +- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container. +- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark. +- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. + +#### Nightly tests + +In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark. 
+ +#### Docker containers + +The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. + +WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`. + +WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). + diff --git a/README.md b/README.md index 8e508195cdceb..a9215f4c7e1c5 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ vLLM is fast with: - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache - Optimized CUDA kernels -**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/3924) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)). +**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)). vLLM is flexible and easy to use with: diff --git a/docs/source/index.rst b/docs/source/index.rst index ded9a424ee68c..8f06f2f2e5469 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -117,6 +117,12 @@ Documentation automatic_prefix_caching/apc automatic_prefix_caching/details +.. toctree:: + :maxdepth: 1 + :caption: Performance benchmarks + + performance_benchmark/benchmarks + .. toctree:: :maxdepth: 2 :caption: Developer Documentation diff --git a/docs/source/performance_benchmark/benchmarks.rst b/docs/source/performance_benchmark/benchmarks.rst new file mode 100644 index 0000000000000..9a23aab10d03d --- /dev/null +++ b/docs/source/performance_benchmark/benchmarks.rst @@ -0,0 +1,23 @@ +.. _benchmarks: + +Benchmark suites of vLLM +======================== + + + +vLLM contains two sets of benchmarks: + ++ **Performance benchmarks**: benchmark vLLM's performance under various workloads at a high frequency (when a pull request (PR for short) of vLLM is being merged). See `vLLM performance dashboard `_ for the latest performance results. + ++ **Nightly benchmarks**: compare vLLM's performance against alternatives (tgi, trt-llm, and lmdeploy) when there are major updates of vLLM (e.g., bumping up to a new version). The latest results are available in the `vLLM GitHub README `_. + + +Trigger a benchmark +------------------- + +The performance benchmarks and nightly benchmarks can be triggered by submitting a PR to vLLM, and label the PR with `perf-benchmarks` and `nightly-benchmarks`. + + +.. note:: + + Please refer to `vLLM performance benchmark descriptions `_ and `vLLM nightly benchmark descriptions `_ for detailed descriptions on benchmark environment, workload and metrics. 
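Before the next patch, a note on the scale handling that the earlier FP8 Marlin changes in this series depend on: both `fp8.py` and the new `compressed_tensors_w8a16_fp8.py` scheme call `convert_to_channelwise(layer.weight_scale, layer.logical_widths)` so that a fused module (QKV, MLP) quantized with per-tensor scales hands the Marlin kernel one scale per output channel. The sketch below is illustrative only and is not the vLLM helper itself: the name `expand_scales_to_channelwise`, the 1-D output layout, and the example shard widths are assumptions; only the inputs (one scale per logical shard, plus the shard widths) come from the diffs above.

from typing import List

import torch


def expand_scales_to_channelwise(weight_scale: torch.Tensor,
                                 logical_widths: List[int]) -> torch.Tensor:
    # Hypothetical stand-in for convert_to_channelwise: one per-tensor scale
    # per logical shard (e.g. the q/k/v shards of a fused qkv_proj) is
    # repeated across that shard's output channels.
    assert weight_scale.numel() == len(logical_widths)
    per_channel = [
        weight_scale[i:i + 1].expand(width)
        for i, width in enumerate(logical_widths)
    ]
    return torch.cat(per_channel)


# Three shards with different per-tensor scales -> one scale per output row.
scales = expand_scales_to_channelwise(torch.tensor([0.02, 0.01, 0.04]),
                                      [4096, 1024, 1024])
print(scales.shape)  # torch.Size([6144])

Seen this way, the change to `prepare_fp8_layer_for_marlin` above also reads naturally: once callers always pass channelwise scales, the helper can drop its own per-tensor broadcast and simply cast and permute whatever scales it receives.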
From cd7edc4e8726d4b87e121f9ec671ecb6dd0c45d6 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 25 Jul 2024 18:05:09 -0400 Subject: [PATCH 034/167] [Bugfix] Fix empty (nullptr) channelwise scales when loading wNa16 using compressed tensors (#6798) --- .../schemes/compressed_tensors_wNa16.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 996cba315c556..a41962ccd66d8 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -55,7 +55,12 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, output_size_per_partition = sum(output_partition_sizes) # If group_size is -1, we are in channelwise case. - group_size = input_size if self.group_size == -1 else self.group_size + channelwise = (self.group_size == -1) + group_size = input_size if channelwise else self.group_size + row_parallel = (input_size != input_size_per_partition) + # In the case of channelwise quantization, we need to replicate the + # scales across all gpus. + partition_scales = (row_parallel and not channelwise) verify_marlin_supports_shape( output_size_per_partition=output_size_per_partition, @@ -66,8 +71,8 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, weight_scale_dim = None scales_and_zp_size = input_size // group_size - if (input_size != input_size_per_partition - and self.group_size is not None): + if partition_scales: + assert input_size_per_partition % group_size == 0 weight_scale_dim = 1 scales_and_zp_size = input_size_per_partition // group_size From f3ff63c3f45974986f13f60647a258b09913c420 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 25 Jul 2024 15:38:32 -0700 Subject: [PATCH 035/167] [doc][distributed] improve multinode serving doc (#6804) --- docs/source/serving/distributed_serving.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst index 4fc36a680084c..5f14fd2b0ee0a 100644 --- a/docs/source/serving/distributed_serving.rst +++ b/docs/source/serving/distributed_serving.rst @@ -79,7 +79,7 @@ On the rest of the worker nodes, run the following command: $ --worker \ $ /path/to/the/huggingface/home/in/this/node -Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. +Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument ``ip_of_head_node`` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. Then, on any node, use ``docker exec -it node /bin/bash`` to enter the container, execute ``ray status`` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. 
@@ -101,7 +101,7 @@ You can also use tensor parallel without pipeline parallel, just set the tensor To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like ``--privileged -e NCCL_IB_HCA=mlx5`` to the ``run_cluster.sh`` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with ``NCCL_DEBUG=TRACE`` environment variable set, e.g. ``NCCL_DEBUG=TRACE vllm serve ...`` and check the logs for the NCCL version and the network used. If you find ``[send] via NET/Socket`` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find ``[send] via NET/IB/GDRDMA`` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. .. warning:: - After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script `_ for more information. + After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script `_ for more information. If you need to set some environment variables for the communication configuration, you can append them to the ``run_cluster.sh`` script, e.g. ``-e NCCL_SOCKET_IFNAME=eth0``. Note that setting environment variables in the shell (e.g. ``NCCL_SOCKET_IFNAME=eth0 vllm serve ...``) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See the `discussion `_ for more information. .. warning:: From b7215de2c5fcdf8af96cf941556d63934ea8f353 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 25 Jul 2024 16:47:55 -0700 Subject: [PATCH 036/167] [Docs] Publish 5th meetup slides (#6799) --- README.md | 10 +--------- docs/source/community/meetups.rst | 1 + 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a9215f4c7e1c5..5f23f0813f606 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,8 @@ Easy, fast, and cheap LLM serving for everyone --- -**The Fifth vLLM Bay Area Meetup (July 24th 5pm-8pm PT)** - -We are excited to announce our fifth vLLM Meetup! -Join us to hear the vLLM's recent updates and the upcoming roadmap. -Additionally, our collaborators from AWS will be presenting their insights and experiences in deploying vLLM. -Register now [here](https://lu.ma/lp0gyjqr) and be part of the event! - ---- - *Latest News* 🔥 +- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). - [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing). 
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst index 0fde31ef9b059..3b01b109ebf2c 100644 --- a/docs/source/community/meetups.rst +++ b/docs/source/community/meetups.rst @@ -5,6 +5,7 @@ vLLM Meetups We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- `The fifth vLLM meetup `__, with AWS, July 24th 2024. `[Slides] `__ - `The fourth vLLM meetup `__, with Cloudflare and BentoML, June 11th 2024. `[Slides] `__ - `The third vLLM meetup `__, with Roblox, April 2nd 2024. `[Slides] `__ - `The second vLLM meetup `__, with IBM Research, January 31st 2024. `[Slides] `__ `[Video (vLLM Update)] `__ `[Video (IBM Research & torch.compile)] `__ From 1adddb14bf0e1a603581bca49e8d29e8bfb337dc Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Thu, 25 Jul 2024 16:53:25 -0700 Subject: [PATCH 037/167] [Core] Fix ray forward_dag error mssg (#6792) --- vllm/executor/ray_gpu_executor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index e4aaeaa24c1bc..564fa79acfd40 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -29,6 +29,7 @@ class RayGPUExecutor(DistributedGPUExecutor): uses_ray: bool = True def _init_executor(self) -> None: + self.forward_dag: Optional["ray.dag.CompiledDAG"] = None # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. @@ -60,8 +61,6 @@ def _init_executor(self) -> None: # Create the parallel GPU workers. self._init_workers_ray(placement_group) - self.forward_dag: Optional["ray.dag.CompiledDAG"] = None - def _configure_ray_workers_use_nsight(self, ray_remote_kwargs) -> Dict[str, Any]: # If nsight profiling is enabled, we need to set the profiling From 443c7cf4cf891e6957d4b31655e58cabceb5a2a7 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 25 Jul 2024 17:44:09 -0700 Subject: [PATCH 038/167] [ci][distributed] fix flaky tests (#6806) --- tests/distributed/test_pipeline_parallel.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index d666b8a1d44bd..5ff39ddfbf996 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -1,3 +1,10 @@ +""" +WARNING: This test runs in both single-node (4 GPUs) and multi-node + (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is + important to set the distributed backend to "mp" to avoid Ray scheduling + all workers in a node other than the head node, which can cause the test + to fail. +""" import os import pytest @@ -78,7 +85,7 @@ def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): "--pipeline-parallel-size", str(PP_SIZE), "--distributed-executor-backend", - "ray", + "mp", ] os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND From 2eb9f4ff262bb39859baebf8d2109abcdadee860 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Thu, 25 Jul 2024 18:08:33 -0700 Subject: [PATCH 039/167] [ci] Mark tensorizer as soft fail and separate from grouped test (#6810) [ci] Mark tensorizer test as soft fail and separate it from grouped test in fast check (#6810) Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e7dd1fdb2e660..633bc5ca95bf9 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -17,11 +17,10 @@ steps: - pytest -v -s test_utils.py # Utils - pytest -v -s worker # Worker -- label: Tensorizer, Metrics, Tracing Test +- label: Metrics, Tracing Test fast_check: true fast_check_only: true commands: - - apt-get install -y curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer - pytest -v -s metrics # Metrics - "pip install \ opentelemetry-sdk \ @@ -221,6 +220,8 @@ steps: - label: Tensorizer Test #mirror_hardwares: [amd] + soft_fail: true + fast_check: true commands: - apt-get install -y curl libsodium23 - export VLLM_WORKER_MULTIPROC_METHOD=spawn From 062a1d0fab111723ab768f94bdd48a6adc054007 Mon Sep 17 00:00:00 2001 From: QQSong Date: Thu, 25 Jul 2024 19:24:58 -0700 Subject: [PATCH 040/167] Fix ReplicatedLinear weight loading (#6793) --- vllm/model_executor/layers/linear.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 0e0a2b72f93d4..b6e280ae65049 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -199,12 +199,16 @@ def __init__(self, self.input_size, self.output_size, self.params_dtype, + weight_loader=self.weight_loader, prefix=prefix) if bias: self.bias = Parameter( torch.empty(self.output_size, dtype=self.params_dtype)) - set_weight_attrs(self.bias, {"output_dim": 0}) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) else: self.register_parameter("bias", None) From 084a01fd3544557990f8af8af6fd3c1185bae848 Mon Sep 17 00:00:00 2001 From: Anthony Platanios Date: Fri, 26 Jul 2024 00:25:35 -0400 Subject: [PATCH 041/167] [Bugfix] [Easy] Fixed a bug in the multiprocessing GPU executor. 
(#6770) --- vllm/executor/multiproc_gpu_executor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 9811fc2a55199..19f7a497cdd9f 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -1,6 +1,7 @@ import asyncio import os import signal +import threading import weakref from functools import partial from typing import Any, List, Optional @@ -115,8 +116,9 @@ def shutdown(signum, frame): if executor := ref(): executor.shutdown() - signal.signal(signal.SIGINT, shutdown) - signal.signal(signal.SIGTERM, shutdown) + if threading.current_thread() is threading.main_thread(): + signal.signal(signal.SIGINT, shutdown) + signal.signal(signal.SIGTERM, shutdown) self.driver_worker = self._create_worker( distributed_init_method=distributed_init_method) From 89a84b0bb7b30706a02836234a94493ea8f780bf Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Fri, 26 Jul 2024 12:31:31 +0800 Subject: [PATCH 042/167] [Core] Use array to speedup padding (#6779) --- vllm/model_executor/layers/sampler.py | 2 +- vllm/model_executor/sampling_metadata.py | 21 ++++++++++++--------- vllm/sequence.py | 23 ++++++++++++++++------- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 5c376797a054f..121458f8156a1 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -220,7 +220,7 @@ def _apply_min_tokens_penalty( seqs_to_penalize: List[int] = [] for j, seq_id in enumerate(seq_ids): seq_data = seq_group.seq_data[seq_id] - if len(seq_data.output_token_ids) < min_tokens: + if len(seq_data.output_token_ids_array) < min_tokens: seqs_to_penalize.append(j) if seqs_to_penalize: diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 390b5d173ebcd..27b37a9d53470 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -1,4 +1,5 @@ import random +from array import array from dataclasses import dataclass from typing import Dict, List, Optional, Tuple @@ -329,8 +330,8 @@ def from_sampling_metadata( user-defined seed for each sequence. extra_entropy: extra entropy to use when generating seeds. 
""" - prompt_tokens: List[List[int]] = [] - output_tokens: List[List[int]] = [] + prompt_tokens: List[array] = [] + output_tokens: List[array] = [] top_ks: List[int] = [] temperatures: List[float] = [] top_ps: List[float] = [] @@ -432,13 +433,15 @@ def from_sampling_metadata( if (seq_group.is_prompt and sampling_params.prompt_logprobs is not None): prefill_len = len(seq_group.prompt_logprob_indices) - prompt_tokens.extend([] for _ in range(prefill_len)) - output_tokens.extend([] for _ in range(prefill_len)) + prompt_tokens.extend( + array('l') for _ in range(prefill_len)) + output_tokens.extend( + array('l') for _ in range(prefill_len)) if seq_group.do_sample: for seq_id in seq_ids: seq_data = seq_group.seq_data[seq_id] - prompt_tokens.append(list(seq_data.prompt_token_ids)) - output_tokens.append(list(seq_data.output_token_ids)) + prompt_tokens.append(seq_data.prompt_token_ids_array) + output_tokens.append(seq_data.output_token_ids_array) sampling_tensors = SamplingTensors.from_lists( temperatures, top_ps, top_ks, min_ps, presence_penalties, @@ -454,9 +457,9 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], frequency_penalties: List[float], repetition_penalties: List[float], sampling_seeds: List[int], sample_indices: List[int], - prompt_tokens: List[List[int]], - output_tokens: List[List[int]], vocab_size: int, - extra_seeds_to_generate: int, device: torch.device, + prompt_tokens: List[array], output_tokens: List[array], + vocab_size: int, extra_seeds_to_generate: int, + device: torch.device, dtype: torch.dtype) -> "SamplingTensors": # Note that the performance will be very bad without # pinned memory. diff --git a/vllm/sequence.py b/vllm/sequence.py index 0cd4c7e71d78d..72821ecea0f47 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -3,6 +3,7 @@ import enum import math from abc import ABC, abstractmethod +from array import array from collections import defaultdict from dataclasses import dataclass, field from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Set, Tuple, @@ -119,10 +120,10 @@ def __init__( prompt_token_ids: List[int], output_token_ids: Optional[List[int]] = None, ) -> None: - self._prompt_token_ids: List[int] = list(prompt_token_ids) + self._prompt_token_ids = array('l', prompt_token_ids) self._prompt_token_ids_tuple: Tuple[int, ...] = tuple(prompt_token_ids) - self._output_token_ids: List[int] = ( - list(output_token_ids) if output_token_ids is not None else []) + self._output_token_ids = array( + 'l', output_token_ids if output_token_ids is not None else []) self.cumulative_logprob = 0.0 # The number of tokens that are computed (that run against the model). 
@@ -132,8 +133,8 @@ def __init__( self._update_cached_all_tokens() def _update_cached_all_tokens(self): - self._cached_all_token_ids: List[int] = (self._prompt_token_ids + - self._output_token_ids) + self._cached_all_token_ids: List[int] = list(self._prompt_token_ids + + self._output_token_ids) @property def prompt_token_ids(self) -> Tuple[int, ...]: @@ -141,19 +142,27 @@ def prompt_token_ids(self) -> Tuple[int, ...]: @prompt_token_ids.setter def prompt_token_ids(self, new_prompt_token_ids) -> None: - self._prompt_token_ids = list(new_prompt_token_ids) + self._prompt_token_ids = array('l', new_prompt_token_ids) self._prompt_token_ids_tuple = tuple(new_prompt_token_ids) self._update_cached_all_tokens() + @property + def prompt_token_ids_array(self) -> array: + return self._prompt_token_ids + @property def output_token_ids(self) -> Tuple[int, ...]: return tuple(self._output_token_ids) @output_token_ids.setter def output_token_ids(self, new_output_token_ids) -> None: - self._output_token_ids = list(new_output_token_ids) + self._output_token_ids = array('l', new_output_token_ids) self._update_cached_all_tokens() + @property + def output_token_ids_array(self) -> array: + return self._output_token_ids + def append_token_id(self, token_id: int, logprob: float) -> None: self._output_token_ids.append(token_id) self._cached_all_token_ids.append(token_id) From 85ad7e2d012edd87de9e84e93ed3204c80599695 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 25 Jul 2024 21:48:05 -0700 Subject: [PATCH 043/167] [doc][debugging] add known issues for hangs (#6816) --- docs/source/getting_started/debugging.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 2aa52e79888a3..d7066f2325b3a 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -65,6 +65,10 @@ Here are some common issues that can cause hangs: If the problem persists, feel free to `open an issue on GitHub `_, with a detailed description of the issue, your environment, and the logs. +Some known issues: + +- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can cause hangs at a low probability (once in about 20 times, depending on the machine configuration). The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_ . + .. warning:: After you find the root cause and solve the issue, remember to turn off all the debugging environment variables defined above, or simply start a new shell to avoid being affected by the debugging settings. If you don't do this, the system might be slow because many debugging functionalities are turned on. 
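One note on the `array`-based change a couple of patches back ([Core] Use array to speedup padding, #6779): storing token ids as `array('l', ...)` keeps them in a contiguous C buffer instead of a list of boxed Python ints, which is what makes the padded sampling tensors cheaper to assemble. The snippet below is only an illustration of that property, not code from the patch; it assumes a 64-bit platform where the `'l'` typecode is 8 bytes and therefore matches `torch.int64`.

from array import array

import torch

# Token ids stored the way SequenceData keeps them after the patch.
output_token_ids = array('l', range(1024))

# Padding is a C-level buffer copy rather than Python-level list building.
padded = output_token_ids + array('l', [0]) * 16

# The buffer protocol lets torch view the data without touching each element.
fast = torch.frombuffer(padded, dtype=torch.int64)

# The list-based equivalent walks and unboxes every Python int.
slow = torch.tensor(list(padded), dtype=torch.int64)

assert torch.equal(fast, slow)

The actual win in the patch comes from `SequenceData` and `SamplingTensors.from_sampling_metadata` no longer copying every sequence's token ids into a fresh Python list on each scheduling step; the buffer view above is just the simplest way to see why the `array` representation is cheaper to turn into tensors.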
From 07278c37ddd898d842bbddc382e4f67ac08dae35 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 26 Jul 2024 14:33:42 -0400 Subject: [PATCH 044/167] [Model] Support Nemotron models (Nemotron-3, Nemotron-4, Minitron) (#6611) --- .../configs/Minitron-4B-Base.yaml | 11 + .../lm-eval-harness/configs/models-small.txt | 1 + vllm/model_executor/layers/activation.py | 16 + .../model_executor/layers/rotary_embedding.py | 3 + vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/nemotron.py | 531 ++++++++++++++++++ vllm/transformers_utils/config.py | 3 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/nemotron.py | 209 +++++++ 9 files changed, 776 insertions(+), 1 deletion(-) create mode 100644 .buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml create mode 100644 vllm/model_executor/models/nemotron.py create mode 100644 vllm/transformers_utils/configs/nemotron.py diff --git a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml new file mode 100644 index 0000000000000..a0466748ea71e --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1 +model_name: "nvidia/Minitron-4B-Base" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.252 + - name: "exact_match,flexible-extract" + value: 0.252 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 109692395acf6..e4df4b547aa5e 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -4,5 +4,6 @@ Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml +Minitron-4B-Base.yaml Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml Qwen2-1.5B-Instruct-FP8W8.yaml diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 5bfdba67b443d..6578193a31597 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -159,6 +159,21 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: +class ReLUSquaredActivation(CustomOp): + """ + Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2 + """ + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + relu_applied = nn.functional.relu(x) + squared = torch.square(relu_applied) + return squared + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + return self.forward_native(x) + + class ScaledActivation(nn.Module): """An activation function with post-scale parameters. 
@@ -207,6 +222,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): "gelu_new": NewGELU(), "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), "relu": nn.ReLU(), + "relu2": ReLUSquaredActivation(), "quick_gelu": QuickGELU(), } diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 60ba4623edc38..aecba0ae74911 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -774,6 +774,7 @@ def get_rope( is_neox_style: bool = True, rope_scaling: Optional[Dict[str, Any]] = None, dtype: Optional[torch.dtype] = None, + rotary_percent: float = 1.0, ) -> RotaryEmbedding: if dtype is None: dtype = torch.get_default_dtype() @@ -786,6 +787,8 @@ def get_rope( rope_scaling_args = tuple(rope_scaling_tuple.items()) else: rope_scaling_args = None + if rotary_percent < 1.0: + rotary_dim = int(rotary_dim * rotary_percent) key = (head_size, rotary_dim, max_position, base, is_neox_style, rope_scaling_args, dtype) if key in _ROPE_DICT: diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 7df5b8fa64710..ead64c0e92553 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -51,6 +51,7 @@ "MPTForCausalLM": ("mpt", "MPTForCausalLM"), "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), "MiniCPMV": ("minicpmv", "MiniCPMV"), + "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py new file mode 100644 index 0000000000000..bb85f20ab9802 --- /dev/null +++ b/vllm/model_executor/models/nemotron.py @@ -0,0 +1,531 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Nemotron model compatible with HuggingFace weights.""" +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import torch +from torch import nn + +from vllm.attention import Attention, AttentionMetadata +from vllm.config import CacheConfig, LoRAConfig +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.transformers_utils.configs import NemotronConfig + +from .interfaces import SupportsLoRA +from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers + +# The architecture is pretty similar to Llama, with these changes: +# - There is no gate_proj, just up_proj +# - Normal LayerNorm (with a +1 to the weights) instead of RMSNorm +# - Squared ReLU instead of SwiGLU +# - Adds a rotary_percent to RoPE + + +def _cast_if_autocast_enabled(*args): + if not torch.is_autocast_enabled(): + return args + else: + return torch.cuda.amp.autocast_mode._cast( + args, torch.get_autocast_gpu_dtype()) + + +class NemotronLayerNorm1P(nn.LayerNorm): + + def __init__(self, + normalized_shape: Union[int, List[int], torch.Size], + eps: float = 1e-5, + elementwise_affine: bool = True, + bias: bool = True, + device=None, + dtype=None): + super().__init__(normalized_shape, eps, elementwise_affine, bias, + device, dtype) + + def forward( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if residual is not None: + x = x + residual + residual = x + args = _cast_if_autocast_enabled(x, self.normalized_shape, + self.weight + 1, self.bias, self.eps) + with torch.cuda.amp.autocast(enabled=False): + x = torch.nn.functional.layer_norm(*args) + return x if residual is None else (x, residual) + + +class NemotronMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + prefix: str = "", + ) -> None: + super().__init__() + self.up_proj = ColumnParallelLinear(input_size=hidden_size, + output_size=intermediate_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.up_proj") + self.down_proj = RowParallelLinear(input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") + self.act_fn = get_act_fn(hidden_act) + + def forward(self, x): + up, _ = self.up_proj(x) + x = self.act_fn(up) + x, _ = self.down_proj(x) + return x + + +class NemotronAttention(nn.Module): + + def __init__( + self, + config: NemotronConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 
8192, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + cache_config: Optional[CacheConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + # MistralConfig has an optional head_dim introduced by Mistral-Nemo + self.head_dim = getattr(config, "head_dim", + self.hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.rotary_percent = config.rope_percent + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + rotary_percent=self.rotary_percent, + ) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class NemotronDecoderLayer(nn.Module): + + def __init__( + self, + config: NemotronConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is not None and getattr( + config, "original_max_position_embeddings", None): + rope_scaling["original_max_position_embeddings"] = ( + config.original_max_position_embeddings) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + # Support abacusai/Smaug-72B-v0.1 with attention_bias + # Support internlm/internlm-7b with bias + attention_bias = getattr(config, "attention_bias", False) or getattr( + config, "bias", False) + self.self_attn = NemotronAttention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + 
num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + bias=attention_bias, + cache_config=cache_config, + prefix=f"{prefix}.self_attn", + ) + self.mlp = NemotronMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + bias=getattr(config, "mlp_bias", False), + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = NemotronLayerNorm1P(config.hidden_size, + eps=config.norm_eps) + self.post_attention_layernorm = NemotronLayerNorm1P( + config.hidden_size, eps=config.norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class NemotronModel(nn.Module): + + def __init__( + self, + config: NemotronConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + if get_pp_group().is_first_rank or (config.tie_word_embeddings + and get_pp_group().is_last_rank): + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + else: + self.embed_tokens = PPMissingLayer() + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: NemotronDecoderLayer(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + prefix=f"{prefix}.layers") + if get_pp_group().is_last_rank: + self.norm = NemotronLayerNorm1P(config.hidden_size, + eps=config.norm_eps) + else: + self.norm = PPMissingLayer() + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + 
hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i - self.start_layer], + attn_metadata, + residual, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class NemotronForCausalLM(nn.Module, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", "o_proj", "up_proj", "down_proj", "embed_tokens", "lm_head" + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + } + + def __init__( + self, + config: NemotronConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + + assert isinstance(config, NemotronConfig) + + self.config = config + self.lora_config = lora_config + + self.model = NemotronModel(config, + cache_config, + quant_config, + lora_config=lora_config, + prefix="model") + if get_pp_group().is_last_rank: + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + quant_config=quant_config, + ) + if config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, + logit_scale) + self.sampler = Sampler() + else: + self.lm_head = PPMissingLayer() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + model_output = self.model(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors) + return model_output + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, + device: torch.device) -> IntermediateTensors: + return IntermediateTensors({ + "hidden_states": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + "residual": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + }) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", 
".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 652505a892142..3ba2e01985598 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -8,7 +8,7 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, JAISConfig, MedusaConfig, MLPSpeculatorConfig, MPTConfig, - RWConfig) + NemotronConfig, RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -26,6 +26,7 @@ "jais": JAISConfig, "mlp_speculator": MLPSpeculatorConfig, "medusa": MedusaConfig, + "nemotron": NemotronConfig, } for name, cls in _CONFIG_REGISTRY.items(): diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 51de11ca3e42a..1750950b3c38b 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -8,6 +8,7 @@ from vllm.transformers_utils.configs.medusa import MedusaConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.mpt import MPTConfig +from vllm.transformers_utils.configs.nemotron import NemotronConfig __all__ = [ "ChatGLMConfig", @@ -17,4 +18,5 @@ "JAISConfig", "MedusaConfig", "MLPSpeculatorConfig", + "NemotronConfig", ] diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py new file mode 100644 index 0000000000000..a22a9f475dda9 --- /dev/null +++ b/vllm/transformers_utils/configs/nemotron.py @@ -0,0 +1,209 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Nemotron model configuration""" + +from transformers import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +class NemotronConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a + [`NemotronModel`]. It is used to instantiate an Nemotron model + according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar + configuration to that of the Nemotron-8B. + + Configuration objects inherit from [`PretrainedConfig`] and can be + used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Nemotron model. Defines the number of + different tokens that can be represented by the + `inputs_ids` passed when calling [`NemotronModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the + Transformer decoder. + head_dim (`int`, *optional*, defaults to None): + Projection weights dimension in multi-head attention. Set to + hidden_size // num_attention_heads if None + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to + implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use + Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention + (MQA) otherwise GQA is used. When converting a multi-head + checkpoint to a GQA checkpoint, each group key and value + head should be constructed by meanpooling all the original + heads within that group. For more details checkout + [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it + is not specified, will default to `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the + decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used + with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for + initializing all weight matrices. + norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values + attentions (not used by all models). Only relevant if + `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE + embeddings. Currently supports two scaling strategies: linear + and dynamic. 
Their scaling factor must be a float greater than 1. + The expected format is `{"type": strategy name, + "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output + projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj and down_proj layers in the MLP + layers. + + ```python + >>> from transformers import NemotronModel, NemotronConfig + + >>> # Initializing a Nemotron nemotron-15b style configuration + >>> configuration = NemotronConfig() + + >>> # Initializing a model from the nemotron-15b style configuration + >>> model = NemotronModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "nemotron" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=256000, + hidden_size=6144, + intermediate_size=24576, + num_hidden_layers=32, + num_attention_heads=48, + head_dim=None, + num_key_value_heads=None, + hidden_act="relu2", + max_position_embeddings=4096, + initializer_range=0.0134, + norm_eps=1e-5, + use_cache=True, + pad_token_id=None, + bos_token_id=2, + eos_token_id=3, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + rope_percent=0.5, + attention_bias=False, + attention_dropout=0.0, + mlp_bias=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + head_dim = head_dim or kwargs.get("kv_channels", None) + self.head_dim = head_dim if head_dim is not None else ( + hidden_size // num_attention_heads) + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.norm_eps = norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + rope_percent = rope_percent or kwargs.get("rope_percentage", None) + self.rope_percent = rope_percent + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.mlp_bias = mlp_bias + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. 
+ """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, + dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with two fields, " + f"`type` and `factor`, got {self.rope_scaling}") + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in [ + "linear", "dynamic" + ]: + raise ValueError( + "`rope_scaling`'s type field must be one of ['linear', " + f"'dynamic'], got {rope_scaling_type}") + if rope_scaling_factor is None or not isinstance( + rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError( + "`rope_scaling`'s factor field must be a float > 1, got " + f"{rope_scaling_factor}") From 50704f52c4643777fb0e5dc99f6c048dd9f54f2d Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 26 Jul 2024 14:41:04 -0400 Subject: [PATCH 045/167] [Bugfix][Kernel] Promote another index to int64_t (#6838) --- csrc/quantization/fp8/common.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index 090f95d1bda71..6dae32b25f9c4 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -48,7 +48,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale, const scalar_t* __restrict__ input, int64_t num_elems) { __shared__ float cache[1024]; - int i = blockDim.x * blockIdx.x + threadIdx.x; + int64_t i = blockDim.x * blockIdx.x + threadIdx.x; // First store maximum for all values processes by // the current thread in cache[threadIdx.x] From 71734f1bf263ed4877e928d7d9c4522d12e9c61f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 12:28:32 -0700 Subject: [PATCH 046/167] [Build/CI][ROCm] Minor simplification to Dockerfile.rocm (#6811) --- Dockerfile.rocm | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index ff39791456398..7b4c0166a04bd 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -53,9 +53,9 @@ RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(whic # Install torch == 2.5.0 on ROCm RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ *"rocm-6.1"*) \ - python3 -m pip uninstall -y torch torchaudio torchvision \ + python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --no-cache-dir --pre \ - torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \ + torch==2.5.0.dev20240710 \ torchvision==0.20.0.dev20240710 \ --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ *) ;; esac @@ -127,13 +127,6 @@ FROM base AS final # Import the vLLM development directory from the build context COPY . . -# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. 
-# Manually remove it so that later steps of numpy upgrade can continue -RUN case "$(which python3)" in \ - *"/opt/conda/envs/py_3.9"*) \ - rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ - *) ;; esac - # Package upgrades for useful functionality or to avoid dependency issues RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install --upgrade numba scipy huggingface-hub[cli] From aa4867791ecd73a5f55b7bad4d9372954e661fe4 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 12:39:49 -0700 Subject: [PATCH 047/167] [Misc][TPU] Support TPU in initialize_ray_cluster (#6812) --- vllm/executor/ray_utils.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index fcbfa30d7a38a..58b864070f727 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -3,7 +3,7 @@ from vllm.config import ParallelConfig from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_ip, is_hip, is_xpu +from vllm.utils import get_ip, is_hip, is_tpu, is_xpu from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -93,32 +93,38 @@ def initialize_ray_cluster( # Placement group is already set. return + device_str = "GPU" if not is_tpu() else "TPU" # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: # We are in a placement group bundles = current_placement_group.bundle_specs # Verify that we can use the placement group. - gpu_bundles = 0 + device_bundles = 0 for bundle in bundles: - bundle_gpus = bundle.get("GPU", 0) - if bundle_gpus > 1: + bundle_devices = bundle.get(device_str, 0) + if bundle_devices > 1: raise ValueError( - "Placement group bundle cannot have more than 1 GPU.") - if bundle_gpus: - gpu_bundles += 1 - if parallel_config.world_size > gpu_bundles: + "Placement group bundle cannot have more than 1 " + f"{device_str}.") + if bundle_devices: + device_bundles += 1 + if parallel_config.world_size > device_bundles: raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the placement group.") + f"The number of required {device_str}s exceeds the total " + f"number of available {device_str}s in the placement group." + f"Required number of devices: {parallel_config.world_size}. 
" + f"Total number of devices: {device_bundles}.") else: - num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) - if parallel_config.world_size > num_gpus_in_cluster: + num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) + if parallel_config.world_size > num_devices_in_cluster: raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the cluster.") + f"The number of required {device_str}s exceeds the total " + f"number of available {device_str}s in the placement group.") # Create a new placement group - placement_group_specs = ([{"GPU": 1}] * parallel_config.world_size) + placement_group_specs = ([{ + device_str: 1 + }] * parallel_config.world_size) current_placement_group = ray.util.placement_group( placement_group_specs) # Wait until PG is ready - this will block until all From 3bbb4936dc5aa7737750410ab4b4647817dcf9a3 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Sat, 27 Jul 2024 04:50:10 +0800 Subject: [PATCH 048/167] [Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) --- .buildkite/run-cpu-test.sh | 28 +- Dockerfile.cpu | 9 +- cmake/cpu_extension.cmake | 4 + csrc/cpu/torch_bindings.cpp | 7 + csrc/cpu/utils.cpp | 65 +++++ .../getting_started/cpu-installation.rst | 55 +++- requirements-cpu.txt | 4 +- vllm/distributed/parallel_state.py | 3 + vllm/engine/async_llm_engine.py | 2 - vllm/envs.py | 8 +- vllm/executor/cpu_executor.py | 264 +++++++++++++++--- vllm/utils.py | 21 -- vllm/worker/cpu_model_runner.py | 7 +- vllm/worker/cpu_worker.py | 17 +- 14 files changed, 404 insertions(+), 90 deletions(-) create mode 100644 csrc/cpu/utils.cpp diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index a7678aae54644..21deec2bba973 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -3,26 +3,38 @@ set -ex # Try building the docker image -docker build -t cpu-test -f Dockerfile.cpu . -docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . +numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu . +numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . # Setup cleanup remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } trap remove_docker_container EXIT remove_docker_container -# Run the image +# Run the image, setting --shm-size=4g for tensor parallel. 
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ - --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test + --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ - --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2 + --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 # offline inference -docker exec cpu-test bash -c "python3 examples/offline_inference.py" docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test -docker exec cpu-test bash -c "cd tests; +docker exec cpu-test bash -c " pip install pytest Pillow protobuf - cd ../ pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported + +# online inference +docker exec cpu-test bash -c " + export VLLM_CPU_KVCACHE_SPACE=10 + export VLLM_CPU_OMP_THREADS_BIND=48-92 + python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & + timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 + python3 benchmarks/benchmark_serving.py \ + --backend vllm \ + --dataset-name random \ + --model facebook/opt-125m \ + --num-prompts 20 \ + --endpoint /v1/completions \ + --tokenizer facebook/opt-125m" diff --git a/Dockerfile.cpu b/Dockerfile.cpu index f95d748f1e4be..c473ba431e680 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -2,8 +2,8 @@ FROM ubuntu:22.04 AS cpu-test-1 -RUN apt-get update -y \ - && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \ +RUN apt-get update -y \ + && apt-get install -y curl git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html @@ -13,8 +13,9 @@ RUN pip install intel-openmp ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD" +RUN echo 'ulimit -c 0' >> ~/.bashrc -RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl +RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl RUN pip install --upgrade pip \ && pip install wheel packaging ninja "setuptools>=49.4.0" numpy @@ -25,7 +26,7 @@ COPY ./ /workspace/vllm WORKDIR /workspace/vllm -RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/test/cpu # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... 
ARG VLLM_CPU_DISABLE_AVX512 diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 690559ee265e9..118f9b28e0ae3 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -83,6 +83,8 @@ endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") +list(APPEND LIBS "numa") + # # Define extension targets @@ -95,6 +97,7 @@ set(VLLM_EXT_SRC "csrc/cpu/activation.cpp" "csrc/cpu/attention.cpp" "csrc/cpu/cache.cpp" + "csrc/cpu/utils.cpp" "csrc/cpu/layernorm.cpp" "csrc/cpu/pos_encoding.cpp" "csrc/cpu/torch_bindings.cpp") @@ -104,6 +107,7 @@ define_gpu_extension_target( DESTINATION vllm LANGUAGE CXX SOURCES ${VLLM_EXT_SRC} + LIBRARIES ${LIBS} COMPILE_FLAGS ${CXX_COMPILE_FLAGS} USE_SABI 3 WITH_SOABI diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 5be0e9810b5b9..7d549e271a30d 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -4,6 +4,8 @@ #include +void init_cpu_threads_env(const std::string& cpu_ids); + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -107,4 +109,9 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache); } +TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { + // CPU utils + utils.def("init_cpu_threads_env(str cpu_ids) -> ()", &init_cpu_threads_env); +} + REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp new file mode 100644 index 0000000000000..5782580baa861 --- /dev/null +++ b/csrc/cpu/utils.cpp @@ -0,0 +1,65 @@ +#include +#include +#include +#include + +#include "cpu_types.hpp" + +void init_cpu_threads_env(const std::string& cpu_ids) { + bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str()); + TORCH_CHECK(omp_cpu_mask->size > 0); + std::vector omp_cpu_ids; + omp_cpu_ids.reserve(omp_cpu_mask->size); + + constexpr int group_size = 8 * sizeof(*omp_cpu_mask->maskp); + + for (int offset = 0; offset < omp_cpu_mask->size; offset += group_size) { + unsigned long group_mask = omp_cpu_mask->maskp[offset / group_size]; + int i = 0; + while (group_mask) { + if (group_mask & 1) { + omp_cpu_ids.emplace_back(offset + i); + } + ++i; + group_mask >>= 1; + } + } + + // Memory node binding + if (numa_available() != -1) { + int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front()); + bitmask* mask = numa_parse_nodestring(std::to_string(mem_node_id).c_str()); + bitmask* src_mask = numa_get_membind(); + + int pid = getpid(); + + // move all existing pages to the specified numa node. + *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp); + int page_num = numa_migrate_pages(pid, src_mask, mask); + if (page_num == -1) { + TORCH_CHECK(false, + "numa_migrate_pages failed. errno: " + std::to_string(errno)); + } + + // restrict memory allocation node. 
+ numa_set_membind(mask); + numa_set_strict(1); + } + + // OMP threads binding + omp_set_num_threads((int)omp_cpu_ids.size()); + torch::set_num_threads((int)omp_cpu_ids.size()); + TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads()); + TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads()); +#pragma omp parallel for schedule(static, 1) + for (size_t i = 0; i < omp_cpu_ids.size(); ++i) { + cpu_set_t* mask = CPU_ALLOC(omp_cpu_mask->size); + size_t size = CPU_ALLOC_SIZE(omp_cpu_mask->size); + CPU_ZERO_S(size, mask); + CPU_SET_S(omp_cpu_ids[i], size, mask); + sched_setaffinity(0, sizeof(cpu_set_t), mask); + CPU_FREE(mask); + } + + numa_free_nodemask(omp_cpu_mask); +} diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index 1c97515dbecd9..7fc469e06844f 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -10,6 +10,7 @@ Table of contents: #. :ref:`Requirements ` #. :ref:`Quick start using Dockerfile ` #. :ref:`Build from source ` +#. :ref:`Related runtime environment variables ` #. :ref:`Intel Extension for PyTorch ` #. :ref:`Performance tips ` @@ -47,7 +48,7 @@ Build from source .. code-block:: console $ sudo apt-get update -y - $ sudo apt-get install -y gcc-12 g++-12 + $ sudo apt-get install -y gcc-12 g++-12 libnuma-dev $ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 - Second, install Python packages for vLLM CPU backend building: @@ -71,6 +72,15 @@ Build from source - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. +.. _env_intro: + +Related runtime environment variables +------------------------------------- + +- ``VLLM_CPU_KVCACHE_SPACE``: specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. + +- ``VLLM_CPU_OMP_THREADS_BIND``: specify the CPU cores dedicated to the OpenMP threads. For example, ``VLLM_CPU_OMP_THREADS_BIND=0-31`` means there will be 32 OpenMP threads bound on 0-31 CPU cores. ``VLLM_CPU_OMP_THREADS_BIND=0-31|32-63`` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. + .. _ipex_guidance: Intel Extension for PyTorch @@ -78,15 +88,11 @@ Intel Extension for PyTorch - `Intel Extension for PyTorch (IPEX) `_ extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. -- IPEX after the ``2.3.0`` can be enabled in the CPU backend by default if it is installed. - .. _cpu_backend_performance_tips: Performance tips ----------------- -- vLLM CPU backend uses environment variable ``VLLM_CPU_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - - We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: .. 
code-block:: console
@@ -96,11 +102,44 @@ Performance tips
 
    $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
    $ python examples/offline_inference.py # run vLLM
 
-- vLLM CPU backend uses OpenMP for thread-parallel computation. If you want the best performance on CPU, it will be very critical to isolate CPU cores for OpenMP threads with other thread pools (like web-service event-loop), to avoid CPU oversubscription.
+- When using online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserve CPU 30 and 31 for the framework and use CPU 0-29 for OpenMP:
 
-- If using vLLM CPU backend on a bare-metal machine, it is recommended to disable the hyper-threading.
+.. code-block:: console
+
+   $ export VLLM_CPU_KVCACHE_SPACE=40
+   $ export VLLM_CPU_OMP_THREADS_BIND=0-29
+   $ vllm serve facebook/opt-125m
+
+- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using ``VLLM_CPU_OMP_THREADS_BIND``. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
+
+.. code-block:: console
 
-- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores and memory nodes, to avoid the remote memory node access. ``numactl`` is an useful tool for CPU core and memory binding on NUMA platform. Besides, ``--cpuset-cpus`` and ``--cpuset-mems`` arguments of ``docker run`` are also useful.
+   $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
+
+   # The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core.
+   CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ    MINMHZ   MHZ
+   0   0    0      0    0:0:0:0       yes    2401.0000 800.0000 800.000
+   1   0    0      1    1:1:1:0       yes    2401.0000 800.0000 800.000
+   2   0    0      2    2:2:2:0       yes    2401.0000 800.0000 800.000
+   3   0    0      3    3:3:3:0       yes    2401.0000 800.0000 800.000
+   4   0    0      4    4:4:4:0       yes    2401.0000 800.0000 800.000
+   5   0    0      5    5:5:5:0       yes    2401.0000 800.0000 800.000
+   6   0    0      6    6:6:6:0       yes    2401.0000 800.0000 800.000
+   7   0    0      7    7:7:7:0       yes    2401.0000 800.0000 800.000
+   8   0    0      0    0:0:0:0       yes    2401.0000 800.0000 800.000
+   9   0    0      1    1:1:1:0       yes    2401.0000 800.0000 800.000
+   10  0    0      2    2:2:2:0       yes    2401.0000 800.0000 800.000
+   11  0    0      3    3:3:3:0       yes    2401.0000 800.0000 800.000
+   12  0    0      4    4:4:4:0       yes    2401.0000 800.0000 800.000
+   13  0    0      5    5:5:5:0       yes    2401.0000 800.0000 800.000
+   14  0    0      6    6:6:6:0       yes    2401.0000 800.0000 800.000
+   15  0    0      7    7:7:7:0       yes    2401.0000 800.0000 800.000
+
+   # On this platform, it is recommended to bind OpenMP threads only on logical CPU cores 0-7 or 8-15
+   $ export VLLM_CPU_OMP_THREADS_BIND=0-7
+   $ python examples/offline_inference.py
+
+- If using vLLM CPU backend on a multi-socket machine with NUMA, be sure to set CPU cores using ``VLLM_CPU_OMP_THREADS_BIND`` to avoid cross-NUMA-node memory access.
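
The ``|`` form of ``VLLM_CPU_OMP_THREADS_BIND`` introduced above is only described in prose; the following is a minimal sketch of a tensor parallel CPU deployment, assuming a two-socket machine whose physical cores are 0-31 and 32-63 (the core ranges and model are illustrative):

.. code-block:: console

   # Rank 0 OpenMP threads are bound to cores 0-31, rank 1 threads to cores 32-63.
   $ export VLLM_CPU_KVCACHE_SPACE=40
   $ export VLLM_CPU_OMP_THREADS_BIND="0-31|32-63"
   $ vllm serve facebook/opt-125m --tensor-parallel-size 2

Quoting the value matters here: an unquoted ``|`` would be interpreted by the shell as a pipe rather than as the rank separator.
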
diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 754070df21c0a..a8ce104d83290 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -2,6 +2,6 @@ -r requirements-common.txt # Dependencies for x86_64 CPUs -torch == 2.3.1+cpu; platform_machine != "ppc64le" -torchvision == 0.18.1+cpu; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch +torch == 2.4.0; platform_machine != "ppc64le" +torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index e9c6fc3a255e4..58cae46d9af27 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -296,6 +296,9 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: pynccl_comm = self.pynccl_comm if (pynccl_comm is not None and not pynccl_comm.disabled): pynccl_comm.all_reduce(input_) + elif input_.is_cpu: + import intel_extension_for_pytorch as ipex + ipex.distributed.all_reduce(input_, group=self.device_group) else: torch.distributed.all_reduce(input_, group=self.device_group) return input_ diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 16b7bc64a2849..93cc319f11c42 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -410,8 +410,6 @@ def _get_executor_cls( from vllm.executor.tpu_executor import TPUExecutorAsync executor_class = TPUExecutorAsync elif engine_config.device_config.device_type == "cpu": - assert distributed_executor_backend is None, ( - "Distributed execution is not supported with the CPU backend.") from vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync elif engine_config.device_config.device_type == "openvino": diff --git a/vllm/envs.py b/vllm/envs.py index 595992e51db87..f06b6d66ea6f4 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -29,6 +29,7 @@ VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 + VLLM_CPU_OMP_THREADS_BIND: str = "" VLLM_OPENVINO_KVCACHE_SPACE: int = 0 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False @@ -241,11 +242,16 @@ def get_default_config_root(): "VLLM_ATTENTION_BACKEND": lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), - # CPU key-value cache space + # (CPU backend only) CPU key-value cache space. # default is 4GB "VLLM_CPU_KVCACHE_SPACE": lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")), + # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31", + # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'. 
+ "VLLM_CPU_OMP_THREADS_BIND": + lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"), + # OpenVINO key-value cache space # default is 4GB "VLLM_OPENVINO_KVCACHE_SPACE": diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 23e429dac7232..3229e5ad20afa 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -1,16 +1,21 @@ -from typing import List, Set, Tuple +import os +from functools import partial +from typing import Any, Awaitable, List, Optional, Set, Tuple, Union import torch import vllm.envs as envs from vllm.config import CacheConfig, ModelConfig, SchedulerConfig from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, + ResultHandler, WorkerMonitor) from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async) +from vllm.utils import (get_distributed_init_method, get_open_port, + get_vllm_instance_id, make_async) +from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -22,46 +27,173 @@ class CPUExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "cpu" assert self.lora_config is None, "cpu backend doesn't support LoRA" + + # + # Environment variables for CPU executor + # + + # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers + os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id() + + # Disable torch async compiling which won't work with daemonic processes + os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" + + # Intel OpenMP setting + ld_prealod_str = os.getenv("LD_PRELOAD", "") + if "libiomp5.so" in ld_prealod_str: + # The time(milliseconds) that a thread should wait after + # completing the execution of a parallel region, before sleeping. + os.environ['KMP_BLOCKTIME'] = "1" + # Prevents the CPU to run into low performance state + os.environ['KMP_TPAUSE'] = "0" + # Provides fine granularity parallelism + os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist" + os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist" + os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist" + + # To hint IPEX uses shared memory based AllReduce + os.environ["LOCAL_WORLD_SIZE"] = str( + self.parallel_config.tensor_parallel_size) + self.model_config = _verify_and_get_model_config(self.model_config) self.cache_config = _verify_and_get_cache_config(self.cache_config) self.scheduler_config = _verify_and_get_scheduler_config( self.scheduler_config) - # Instantiate the worker and load the model to CPU. - self._init_worker() - - def _init_worker(self): - from vllm.worker.cpu_worker import CPUWorker + # Multiprocessing-based executor does not support multi-node setting. + # Since it only works for single node, we can use the loopback address + # 127.0.0.1 for communication. 
+ ip = "127.0.0.1" + port = get_open_port() + self.distributed_init_method = get_distributed_init_method(ip, port) + + is_async = isinstance(self, CPUExecutorAsync) + + world_size = self.parallel_config.tensor_parallel_size + result_handler = ResultHandler() + self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + self.workers = [] + + if is_async: + self.workers = [ + ProcessWorkerWrapper( + result_handler, + partial( + self._create_worker, + rank=rank, + local_rank=rank, + )) for rank in range(0, world_size) + ] + self.driver_worker = self.workers[0] + self.workers = self.workers[1:] + self.driver_method_invoker = _async_driver_method_invoker + else: + self.driver_worker = self._create_worker() + self.driver_method_invoker = _driver_method_invoker + + if world_size != 1: + self.workers = [ + ProcessWorkerWrapper( + result_handler, + partial( + self._create_worker, + rank=rank, + local_rank=rank, + )) for rank in range(1, world_size) + ] + + if world_size != 1 or is_async: + if is_async: + async_worker_list = self.workers + [self.driver_worker] + else: + async_worker_list = self.workers + self.worker_monitor = WorkerMonitor(async_worker_list, + result_handler) + result_handler.start() + self.worker_monitor.start() + + self._run_workers("init_device") + self._run_workers("load_model") + + def _create_worker( + self, + local_rank: int = 0, + rank: int = 0, + ): + worker_module_name = "vllm.worker.cpu_worker" + worker_class_name = "CPUWorker" + + wrapper = WorkerWrapperBase( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + ) - assert self.parallel_config.world_size == 1, ( - "CPUExecutor only supports single CPU socket currently.") + assert self.distributed_init_method is not None - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = CPUWorker( + kwargs = dict( model_config=self.model_config, parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, device_config=self.device_config, cache_config=self.cache_config, load_config=self.load_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, + local_rank=local_rank, + rank=rank, + distributed_init_method=self.distributed_init_method, lora_config=self.lora_config, multimodal_config=self.multimodal_config, kv_cache_dtype=self.cache_config.cache_dtype, prompt_adapter_config=self.prompt_adapter_config, - is_driver_worker=True, + is_driver_worker=rank == 0, ) - self.driver_worker.init_device() - self.driver_worker.load_model() + wrapper.init_worker(**kwargs) + + return wrapper.worker + + def _run_workers( + self, + method: str, + *args, + async_run_remote_workers_only: bool = False, + max_concurrent_workers: Optional[int] = None, + **kwargs, + ) -> Any: + """Runs the given method on all workers. + + Args: + async_run_remote_workers_only: If True the method will be run only + in the remote workers, not the driver worker. It will also be + run asynchronously and return a list of futures rather than + blocking on the results. + """ + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + # Start the workers first. + worker_outputs = [ + worker.execute_method(method, *args, **kwargs) + for worker in self.workers + ] + + if async_run_remote_workers_only: + # Just return futures + return worker_outputs + + driver_worker_output = self.driver_method_invoker( + self.driver_worker, method, *args, **kwargs) + + # Get the results of the workers. 
+ return [driver_worker_output + ] + [output.get() for output in worker_outputs] def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of available KV blocks by invoking the underlying worker. """ - return self.driver_worker.determine_num_available_blocks() + return self.driver_method_invoker(self.driver_worker, + "determine_num_available_blocks") def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: @@ -74,43 +206,95 @@ def initialize_cache(self, num_gpu_blocks: int, # referred as `gpu block`. Because we want to reuse the existing block # management procedure. logger.info("# CPU blocks: %d", num_gpu_blocks) - self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + + self._run_workers("initialize_cache", + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) def execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - output = self.driver_worker.execute_model(execute_model_req) + if (self.parallel_config.tensor_parallel_size > 1 + and self.parallel_worker_tasks is None): + self.parallel_worker_tasks = self._run_workers( + "start_worker_execution_loop", + async_run_remote_workers_only=True, + ) + output = self.driver_method_invoker(self.driver_worker, + "execute_model", execute_model_req) return output + def stop_remote_worker_execution_loop(self) -> None: + if self.parallel_worker_tasks is None: + return + """ + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + self.driver_method_invoker(self.driver_worker, "execute_model", None) + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + self._wait_for_tasks_completion(parallel_worker_tasks) + def add_lora(self, lora_request: LoRARequest) -> bool: - return self.driver_worker.add_lora(lora_request) + return all(self._run_workers("add_lora", lora_request)) def remove_lora(self, lora_id: int) -> bool: - return self.driver_worker.remove_lora(lora_id) + return all(self._run_workers("remove_lora", lora_id)) def pin_lora(self, lora_id: int) -> bool: - return self.driver_worker.pin_lora(lora_id) + assert lora_id > 0, "lora_id must be greater than 0." + return all(self._run_workers( + "pin_lora", + lora_id=lora_id, + )) def list_loras(self) -> Set[int]: - return self.driver_worker.list_loras() + return self.driver_method_invoker(self.driver_worker, "list_loras") def add_prompt_adapter( self, prompt_adapter_request: PromptAdapterRequest) -> bool: - return self.driver_worker.add_prompt_adapter(prompt_adapter_request) + return all( + self._run_workers( + "add_prompt_adapter", + prompt_adapter_request, + )) def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - return self.driver_worker.remove_prompt_adapter(prompt_adapter_id) + return all( + self._run_workers( + "remove_prompt_adapter", + prompt_adapter_id, + )) def list_prompt_adapters(self) -> Set[int]: - return self.driver_worker.list_prompt_adapters() + return self.driver_method_invoker(self.driver_worker, + "list_prompt_adapters") def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - return self.driver_worker.pin_prompt_adapter(prompt_adapter_id) + return all(self._run_workers( + "pin_prompt_adapter", + prompt_adapter_id, + )) def check_health(self) -> None: - # CPUExecutor will always be healthy as long as - # it's running. 
- return + """Raises an error if engine is unhealthy.""" + if self.worker_monitor is not None and not self.worker_monitor.is_alive( + ): + raise RuntimeError("Worker processes are not running") + + def shutdown(self): + if (worker_monitor := getattr(self, "worker_monitor", + None)) is not None: + worker_monitor.close() + + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + for result in parallel_worker_tasks: + result.get() class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase): @@ -118,14 +302,12 @@ class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase): async def execute_model_async( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - output = await make_async(self.driver_worker.execute_model + output = await make_async(self.execute_model )(execute_model_req=execute_model_req, ) return output async def check_health_async(self) -> None: - # CPUExecutor will always be healthy as long as - # it's running. - return + self.check_health() def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: @@ -170,3 +352,11 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig: f" {kv_cache_space}, expect a positive integer value.") return config + + +def _driver_method_invoker(driver, method: str, *args, **kwargs): + return getattr(driver, method)(*args, **kwargs) + + +def _async_driver_method_invoker(driver, method: str, *args, **kwargs): + return driver.execute_method(method, *args, **kwargs).get() diff --git a/vllm/utils.py b/vllm/utils.py index 876c3bf90b02c..90be09fc7b967 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -404,27 +404,6 @@ def update_environment_variables(envs: Dict[str, str]): os.environ[k] = v -def init_kmp_env(): - if not is_cpu(): - return - - ld_prealod_str = os.getenv("LD_PRELOAD", "") - if "libiomp5.so" not in ld_prealod_str: - return - - # The time(milliseconds) that a thread should wait after completing the - # execution of a parallel region, before sleeping. 
- os.environ['KMP_BLOCKTIME'] = "1" - # dump settings on start up - os.environ['KMP_SETTINGS'] = "1" - # Prevents the CPU to run into low performance state - os.environ['KMP_TPAUSE'] = "0" - # Provides fine granularity parallelism - os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist" - os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist" - os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist" - - def chunk_list(lst: List[T], chunk_size: int): """Yield successive chunk_size chunks from lst.""" for i in range(0, len(lst), chunk_size): diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 83f4ba69fb728..71763c08ec45f 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -42,6 +42,7 @@ class CPUModelInput(ModelRunnerInputBase): attn_metadata: Optional["AttentionMetadata"] = None sampling_metadata: Optional["SamplingMetadata"] = None multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]] = None + virtual_engine: Optional[int] = None def as_broadcastable_tensor_dict( self) -> Dict[str, Union[int, torch.Tensor]]: @@ -204,8 +205,8 @@ def _prepare_prompt( attn_metadata = self.attn_backend.make_metadata( is_prompt=True, seq_lens=seq_lens, - seq_lens_tensor=None, - max_decode_seq_len=None, + seq_lens_tensor=torch.tensor([]), + max_decode_seq_len=0, num_prefills=len(seq_lens), num_prefill_tokens=num_prompt_tokens, num_decode_tokens=0, @@ -345,7 +346,7 @@ def prepare_model_input( multi_modal_kwargs=multi_modal_kwargs, ) - @torch.inference_mode() + @torch.no_grad() def execute_model( self, model_input: CPUModelInput, diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 3c22c73267b7f..735d48c908d61 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -4,6 +4,7 @@ import torch import torch.distributed +import vllm.envs as envs from vllm.attention import get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, MultiModalConfig, ParallelConfig, @@ -13,7 +14,7 @@ from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, init_kmp_env +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.worker.cpu_model_runner import CPUModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, LoraNotSupportedWorkerBase, WorkerInput) @@ -152,13 +153,18 @@ def __init__( if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." - # try to initialize intel openmp optimized tunings - init_kmp_env() - if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules init_cached_hf_modules() + + # Setup OpenMP threads affinity. + omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND + if omp_cpuids == "all": + self.local_omp_cpuid = "all" + else: + self.local_omp_cpuid = omp_cpuids.split("|")[rank] + self.model_runner: CPUModelRunner = CPUModelRunner( model_config, parallel_config, @@ -177,6 +183,9 @@ def __init__( self.cpu_cache: List[List[torch.Tensor]] def init_device(self) -> None: + if self.local_omp_cpuid != "all": + torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) + self.init_distributed_environment() # Set random seed. 
set_random_seed(self.model_config.seed) From 281977bd6eccade50be461f5a22cc51b74006976 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 26 Jul 2024 17:32:44 -0400 Subject: [PATCH 049/167] [Doc] Add Nemotron to supported model docs (#6843) --- docs/source/models/supported_models.rst | 4 ++++ vllm/model_executor/layers/activation.py | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index dc8bd6fb245df..483f552bba238 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -113,6 +113,10 @@ Decoder-only Language Models - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. - + * - :code:`NemotronForCausalLM` + - Nemotron-3, Nemotron-4, Minitron + - :code:`nvidia/Minitron-8B-Base`, :code:`mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. + - ✅︎ * - :code:`OLMoForCausalLM` - OLMo - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc. diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 6578193a31597..4c14fe476ee4a 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -166,9 +166,7 @@ class ReLUSquaredActivation(CustomOp): def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" - relu_applied = nn.functional.relu(x) - squared = torch.square(relu_applied) - return squared + return torch.square(F.relu(x)) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: return self.forward_native(x) From 150a1ffbfd3d0429d30fa5ab841f53903a0a8a62 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 26 Jul 2024 17:39:10 -0400 Subject: [PATCH 050/167] [Doc] Update SkyPilot doc for wrong indents and instructions for update service (#4283) --- docs/source/serving/run_on_sky.rst | 430 ++++++++++++++++------------- 1 file changed, 243 insertions(+), 187 deletions(-) diff --git a/docs/source/serving/run_on_sky.rst b/docs/source/serving/run_on_sky.rst index bd33c76cec3de..674b14a879bc3 100644 --- a/docs/source/serving/run_on_sky.rst +++ b/docs/source/serving/run_on_sky.rst @@ -5,9 +5,9 @@ Deploying and scaling up with SkyPilot .. raw:: html -

-    [vLLM logo image]
+    [vLLM logo image]
vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with `SkyPilot `__, an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in `SkyPilot AI gallery `__. @@ -21,8 +21,8 @@ Prerequisites .. code-block:: console - pip install skypilot-nightly - sky check + pip install skypilot-nightly + sky check Run on a single instance @@ -32,64 +32,64 @@ See the vLLM SkyPilot YAML for serving, `serving.yaml # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log & - - echo 'Waiting for vllm api server to start...' - while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://localhost:8081/v1 \ - --stop-token-ids 128009,128001 + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log & + + echo 'Waiting for vllm api server to start...' + while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://localhost:8081/v1 \ + --stop-token-ids 128009,128001 Start the serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): .. code-block:: console - HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN + HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion. .. code-block:: console - (task, pid=7431) Running on public URL: https://.gradio.live + (task, pid=7431) Running on public URL: https://.gradio.live **Optional**: Serve the 70B model instead of the default 8B and use more GPU: .. 
code-block:: console - HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct + HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct Scale up to multiple replicas @@ -99,151 +99,212 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut .. code-block:: yaml - service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_tokens: 1 - + service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_tokens: 1 + .. raw:: html -
- Click to see the full recipe YAML +
+ Click to see the full recipe YAML .. code-block:: yaml - service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? + service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? max_tokens: 1 - resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log & - - echo 'Waiting for vllm api server to start...' - while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://localhost:8081/v1 \ - --stop-token-ids 128009,128001 + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log .. raw:: html -
+
Start the serving the Llama-3 8B model on multiple replicas: .. code-block:: console - HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN + HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN Wait until the service is ready: .. code-block:: console - watch -n10 sky serve status vllm + watch -n10 sky serve status vllm .. raw:: html -
- Example outputs: +
+ Example outputs: .. code-block:: console - Services - NAME VERSION UPTIME STATUS REPLICAS ENDPOINT - vllm 1 35s READY 2/2 xx.yy.zz.100:30001 + Services + NAME VERSION UPTIME STATUS REPLICAS ENDPOINT + vllm 1 35s READY 2/2 xx.yy.zz.100:30001 - Service Replicas - SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION - vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP({'L4': 1}) READY us-east4 - vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP({'L4': 1}) READY us-east4 + Service Replicas + SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION + vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 + vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 .. raw:: html - -
+ +
After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: .. code-block:: console - ENDPOINT=$(sky serve status --endpoint 8081 vllm) - curl -L http://$ENDPOINT/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Meta-Llama-3-8B-Instruct", - "messages": [ - { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Who are you?" - } - ], - "stop_token_ids": [128009, 128001] - }' - -To enable autoscaling, you could specify additional configs in `services`: + ENDPOINT=$(sky serve status --endpoint 8081 vllm) + curl -L http://$ENDPOINT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ], + "stop_token_ids": [128009, 128001] + }' + +To enable autoscaling, you could replace the `replicas` with the following configs in `service`: .. code-block:: yaml - services: - replica_policy: - min_replicas: 0 - max_replicas: 3 - target_qps_per_replica: 2 + service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 This will scale the service up to when the QPS exceeds 2 for each replica. + +.. raw:: html + +
+ Click to see the full recipe YAML + + +.. code-block:: yaml + + service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_tokens: 1 + + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log + + +.. raw:: html + +
+ +To update the service with the new config: + +.. code-block:: console + + HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN + + +To stop the service: + +.. code-block:: console + + sky serve down vllm + **Optional**: Connect a GUI to the endpoint ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -253,58 +314,53 @@ It is also possible to access the Llama-3 service with a separate GUI frontend, .. raw:: html -
- Click to see the full GUI YAML +
+ Click to see the full GUI YAML .. code-block:: yaml - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct - ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. - - resources: - cpus: 2 - - setup: | - conda activate vllm - if [ $? -ne 0 ]; then - conda create -n vllm python=3.10 -y - conda activate vllm - fi - - # Install Gradio for web UI. - pip install gradio openai - - run: | - conda activate vllm - export PATH=$PATH:/sbin - WORKER_IP=$(hostname -I | cut -d' ' -f1) - CONTROLLER_PORT=21001 - WORKER_PORT=21002 - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://$ENDPOINT/v1 \ - --stop-token-ids 128009,128001 | tee ~/gradio.log + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. + + resources: + cpus: 2 + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + # Install Gradio for web UI. + pip install gradio openai + + run: | + conda activate vllm + export PATH=$PATH:/sbin + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://$ENDPOINT/v1 \ + --stop-token-ids 128009,128001 | tee ~/gradio.log + .. raw:: html - -
+ +
1. Start the chat web UI: .. code-block:: console - sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) + sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) 2. Then, we can access the GUI at the returned gradio link: .. code-block:: console - | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live + | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live From b5f49ee55beac7bb314fd8880bdb718ed117dacb Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami <143527450+gurpreet-dhami@users.noreply.github.com> Date: Fri, 26 Jul 2024 20:26:45 -0400 Subject: [PATCH 051/167] Update README.md (#6847) --- examples/fp8/quantizer/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fp8/quantizer/README.md b/examples/fp8/quantizer/README.md index 8f89a74a6a367..0b6944f688b49 100644 --- a/examples/fp8/quantizer/README.md +++ b/examples/fp8/quantizer/README.md @@ -16,7 +16,7 @@ #### Run on H100 system for speed if FP8; number of GPUs depends on the model size #### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache: -`python quantize.py --model_dir ./ll2-7b --dtype float16 --qformat fp8 --kv_cache_dtype fp8 --output_dir ./ll2_7b_fp8 --calib_size 512 --tp_size 1` +`python quantize.py --model-dir ./ll2-7b --dtype float16 --qformat fp8 --kv-cache-dtype fp8 --output-dir ./ll2_7b_fp8 --calib-size 512 --tp-size 1` Outputs: model structure, quantized model & parameters (with scaling factors) are in JSON and Safetensors (npz is generated only for the reference) ``` From bb5494676f5f57f1cf7cf72598de5434a2a22865 Mon Sep 17 00:00:00 2001 From: chenqianfzh <51831990+chenqianfzh@users.noreply.github.com> Date: Fri, 26 Jul 2024 18:32:20 -0700 Subject: [PATCH 052/167] enforce eager mode with bnb quantization temporarily (#6846) --- vllm/config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index 6403a53f86281..92fde449b43fd 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -282,6 +282,10 @@ def verify_with_parallel_config( raise ValueError( "BitAndBytes quantization with TP or PP is not supported yet.") + if self.quantization == "bitsandbytes" and self.enforce_eager is False: + raise ValueError( + "BitAndBytes with enforce_eager = False is not supported yet.") + def get_hf_config_sliding_window(self) -> Optional[int]: """Get the sliding window size, or None if disabled.""" From d09b94ca588c6de1e627194264357e14460ae2eb Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 18:45:57 -0700 Subject: [PATCH 053/167] [TPU] Support collective communications in XLA devices (#6813) --- .../device_communicators/tpu_communicator.py | 30 +++++++++++++++++++ vllm/distributed/parallel_state.py | 22 ++++++++++++++ vllm/lora/layers.py | 4 +++ .../model_executor/layers/logits_processor.py | 16 ++++++++-- 4 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 vllm/distributed/device_communicators/tpu_communicator.py diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py new file mode 100644 index 0000000000000..69a9a516f3ebe --- /dev/null +++ b/vllm/distributed/device_communicators/tpu_communicator.py @@ -0,0 +1,30 @@ +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from vllm.platforms import current_platform + +if current_platform.is_tpu(): + import torch_xla.core.xla_model as xm + from 
torch_xla._internal import pjrt + + +class TpuCommunicator: + + def __init__(self, group: ProcessGroup): + if not current_platform.is_tpu(): + self.disabled = True + return + self.disabled = False + + local_rank = dist.get_rank(group) + world_size = dist.get_world_size(group) + pjrt.initialize_multiprocess(local_rank, world_size) + xm._init_world_size_ordinal() + + def all_reduce(self, x: torch.Tensor) -> torch.Tensor: + return xm.all_reduce(xm.REDUCE_SUM, x) + + def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor: + assert dim == -1, "TPUs only support dim=-1 for all-gather." + return xm.all_gather(x, dim=dim) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 58cae46d9af27..4116b1729d188 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -133,6 +133,7 @@ def __init__( torch_distributed_backend: Union[str, Backend], use_pynccl: bool, use_custom_allreduce: bool, + use_tpu_communicator: bool, use_message_queue_broadcaster: bool = False, ): @@ -164,6 +165,7 @@ def __init__( self.use_pynccl = use_pynccl self.use_custom_allreduce = use_custom_allreduce + self.use_tpu_communicator = use_tpu_communicator # lazy import to avoid documentation build error from vllm.distributed.device_communicators.custom_all_reduce import ( @@ -190,6 +192,12 @@ def __init__( else: self.ca_comm = None + from vllm.distributed.device_communicators.tpu_communicator import ( + TpuCommunicator) + self.tpu_communicator: Optional[TpuCommunicator] + if use_tpu_communicator and self.world_size > 1: + self.tpu_communicator = TpuCommunicator(group=self.cpu_group) + from vllm.distributed.device_communicators.shm_broadcast import ( MessageQueue) self.mq_broadcaster: Optional[MessageQueue] = None @@ -289,6 +297,12 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: # Bypass the function if we are using only 1 GPU. if self.world_size == 1: return input_ + + # For TPUs, use TPU communicator. + tpu_comm = self.tpu_communicator + if tpu_comm is not None and not tpu_comm.disabled: + return tpu_comm.all_reduce(input_) + if ca_comm is not None: out = ca_comm.custom_all_reduce(input_) if out is not None: @@ -310,6 +324,12 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: return input_ assert -input_.dim() <= dim < input_.dim(), ( f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") + + # For TPUs, use TPU communicator. + tpu_comm = self.tpu_communicator + if tpu_comm is not None and not tpu_comm.disabled: + return tpu_comm.all_gather(input_, dim) + if dim < 0: # Convert negative dim to positive. 
dim += input_.dim() @@ -727,6 +747,7 @@ def init_world_group(ranks: List[int], local_rank: int, torch_distributed_backend=backend, use_pynccl=False, use_custom_allreduce=False, + use_tpu_communicator=False, ) @@ -745,6 +766,7 @@ def init_model_parallel_group( torch_distributed_backend=backend, use_pynccl=True, use_custom_allreduce=use_custom_allreduce, + use_tpu_communicator=True, use_message_queue_broadcaster=use_message_queue_broadcaster, ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 40de134c0a5ee..87de285a373a2 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1067,6 +1067,10 @@ def scale(self): def soft_cap(self): return self.base_layer.soft_cap + @property + def use_gather(self): + return self.base_layer.use_gather + @property def org_vocab_size(self): return self.base_layer.org_vocab_size diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index f6fcf49ef464b..bd3e7e114204f 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -5,10 +5,12 @@ import torch import torch.nn as nn -from vllm.distributed import tensor_model_parallel_gather +from vllm.distributed import (tensor_model_parallel_all_gather, + tensor_model_parallel_gather) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform class LogitsProcessor(nn.Module): @@ -39,6 +41,8 @@ def __init__(self, self.org_vocab_size = org_vocab_size or vocab_size # Soft cap the logits. Used in Gemma 2. self.soft_cap = soft_cap + # Whether to use gather or all-gather to gather the logits. + self.use_gather = not current_platform.is_tpu() def forward( self, @@ -76,7 +80,15 @@ def _get_logits(self, hidden_states: torch.Tensor, logits = lm_head.linear_method.apply(lm_head, hidden_states, bias=embedding_bias) - logits = tensor_model_parallel_gather(logits) + if self.use_gather: + logits = tensor_model_parallel_gather(logits) + else: + # Gather is not supported for some devices such as TPUs. + # Use all-gather instead. + # NOTE(woosuk): Here, the outputs of every device should not be None + # because XLA requires strict SPMD among all devices. Every device + # should execute the same operations after gathering the logits. + logits = tensor_model_parallel_all_gather(logits) # Remove paddings in vocab (if any). if logits is not None: logits = logits[:, :self.org_vocab_size] From 981b0d567355063d5453e382a85970cae083c615 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 27 Jul 2024 09:58:25 +0800 Subject: [PATCH 054/167] [Frontend] Factor out code for running uvicorn (#6828) --- vllm/entrypoints/api_server.py | 74 ++++++++++++++++++--------- vllm/entrypoints/openai/api_server.py | 72 ++++++++------------------ vllm/server/__init__.py | 3 ++ vllm/server/launch.py | 42 +++++++++++++++ 4 files changed, 116 insertions(+), 75 deletions(-) create mode 100644 vllm/server/__init__.py create mode 100644 vllm/server/launch.py diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 66941442c8c9c..3476357658522 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -5,12 +5,12 @@ We are also not going to accept PRs modifying this file, please change `vllm/entrypoints/openai/api_server.py` instead. 
""" - +import asyncio import json import ssl -from typing import AsyncGenerator +from argparse import Namespace +from typing import Any, AsyncGenerator, Optional -import uvicorn from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse @@ -18,8 +18,10 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.logger import init_logger from vllm.sampling_params import SamplingParams +from vllm.server import serve_http from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, random_uuid +from vllm.version import __version__ as VLLM_VERSION logger = init_logger("vllm.entrypoints.api_server") @@ -81,6 +83,50 @@ async def stream_results() -> AsyncGenerator[bytes, None]: return JSONResponse(ret) +def build_app(args: Namespace) -> FastAPI: + global app + + app.root_path = args.root_path + return app + + +async def init_app( + args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None, +) -> FastAPI: + app = build_app(args) + + global engine + + engine_args = AsyncEngineArgs.from_cli_args(args) + engine = (llm_engine + if llm_engine is not None else AsyncLLMEngine.from_engine_args( + engine_args, usage_context=UsageContext.API_SERVER)) + + return app + + +async def run_server(args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None, + **uvicorn_kwargs: Any) -> None: + logger.info("vLLM API server version %s", VLLM_VERSION) + logger.info("args: %s", args) + + app = await init_app(args, llm_engine) + await serve_http( + app, + host=args.host, + port=args.port, + log_level=args.log_level, + timeout_keep_alive=TIMEOUT_KEEP_ALIVE, + ssl_keyfile=args.ssl_keyfile, + ssl_certfile=args.ssl_certfile, + ssl_ca_certs=args.ssl_ca_certs, + ssl_cert_reqs=args.ssl_cert_reqs, + **uvicorn_kwargs, + ) + + if __name__ == "__main__": parser = FlexibleArgumentParser() parser.add_argument("--host", type=str, default=None) @@ -105,25 +151,5 @@ async def stream_results() -> AsyncGenerator[bytes, None]: parser.add_argument("--log-level", type=str, default="debug") parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngine.from_engine_args( - engine_args, usage_context=UsageContext.API_SERVER) - - app.root_path = args.root_path - logger.info("Available routes are:") - for route in app.routes: - if not hasattr(route, 'methods'): - continue - methods = ', '.join(route.methods) - logger.info("Route: %s, Methods: %s", route.path, methods) - - uvicorn.run(app, - host=args.host, - port=args.port, - log_level=args.log_level, - timeout_keep_alive=TIMEOUT_KEEP_ALIVE, - ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile, - ssl_ca_certs=args.ssl_ca_certs, - ssl_cert_reqs=args.ssl_cert_reqs) + asyncio.run(run_server(args)) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 0fe4dd245b5e6..c1640a10a407d 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -2,14 +2,12 @@ import importlib import inspect import re -import signal +from argparse import Namespace from contextlib import asynccontextmanager from http import HTTPStatus -from typing import Optional, Set +from typing import Any, Optional, Set -import fastapi -import uvicorn -from fastapi import APIRouter, Request +from fastapi import APIRouter, FastAPI, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from 
fastapi.responses import JSONResponse, Response, StreamingResponse @@ -38,6 +36,7 @@ from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) from vllm.logger import init_logger +from vllm.server import serve_http from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser from vllm.version import __version__ as VLLM_VERSION @@ -57,7 +56,7 @@ @asynccontextmanager -async def lifespan(app: fastapi.FastAPI): +async def lifespan(app: FastAPI): async def _force_log(): while True: @@ -75,7 +74,7 @@ async def _force_log(): router = APIRouter() -def mount_metrics(app: fastapi.FastAPI): +def mount_metrics(app: FastAPI): # Add prometheus asgi middleware to route /metrics requests metrics_route = Mount("/metrics", make_asgi_app()) # Workaround for 307 Redirect for /metrics @@ -165,8 +164,8 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): return JSONResponse(content=generator.model_dump()) -def build_app(args): - app = fastapi.FastAPI(lifespan=lifespan) +def build_app(args: Namespace) -> FastAPI: + app = FastAPI(lifespan=lifespan) app.include_router(router) app.root_path = args.root_path @@ -214,11 +213,8 @@ async def authentication(request: Request, call_next): return app -async def build_server( - args, - llm_engine: Optional[AsyncLLMEngine] = None, - **uvicorn_kwargs, -) -> uvicorn.Server: +async def init_app(args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None) -> FastAPI: app = build_app(args) if args.served_model_name is not None: @@ -281,14 +277,17 @@ async def build_server( ) app.root_path = args.root_path - logger.info("Available routes are:") - for route in app.routes: - if not hasattr(route, 'methods'): - continue - methods = ', '.join(route.methods) - logger.info("Route: %s, Methods: %s", route.path, methods) + return app + + +async def run_server(args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None, + **uvicorn_kwargs: Any) -> None: + logger.info("vLLM API server version %s", VLLM_VERSION) + logger.info("args: %s", args) - config = uvicorn.Config( + app = await init_app(args, llm_engine) + await serve_http( app, host=args.host, port=args.port, @@ -301,36 +300,6 @@ async def build_server( **uvicorn_kwargs, ) - return uvicorn.Server(config) - - -async def run_server(args, llm_engine=None, **uvicorn_kwargs) -> None: - logger.info("vLLM API server version %s", VLLM_VERSION) - logger.info("args: %s", args) - - server = await build_server( - args, - llm_engine, - **uvicorn_kwargs, - ) - - loop = asyncio.get_running_loop() - - server_task = loop.create_task(server.serve()) - - def signal_handler() -> None: - # prevents the uvicorn signal handler to exit early - server_task.cancel() - - loop.add_signal_handler(signal.SIGINT, signal_handler) - loop.add_signal_handler(signal.SIGTERM, signal_handler) - - try: - await server_task - except asyncio.CancelledError: - print("Gracefully stopping http server") - await server.shutdown() - if __name__ == "__main__": # NOTE(simon): @@ -339,4 +308,5 @@ def signal_handler() -> None: description="vLLM OpenAI-Compatible RESTful API server.") parser = make_arg_parser(parser) args = parser.parse_args() + asyncio.run(run_server(args)) diff --git a/vllm/server/__init__.py b/vllm/server/__init__.py new file mode 100644 index 0000000000000..17c98b4dad6c9 --- /dev/null +++ b/vllm/server/__init__.py @@ -0,0 +1,3 @@ +from .launch import serve_http + +__all__ = ["serve_http"] diff --git a/vllm/server/launch.py b/vllm/server/launch.py new file mode 100644 index 
0000000000000..1a8aeb7f1022b --- /dev/null +++ b/vllm/server/launch.py @@ -0,0 +1,42 @@ +import asyncio +import signal +from typing import Any + +import uvicorn +from fastapi import FastAPI + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +async def serve_http(app: FastAPI, **uvicorn_kwargs: Any) -> None: + logger.info("Available routes are:") + for route in app.routes: + methods = getattr(route, "methods", None) + path = getattr(route, "path", None) + + if methods is None or path is None: + continue + + logger.info("Route: %s, Methods: %s", path, ', '.join(methods)) + + config = uvicorn.Config(app, **uvicorn_kwargs) + server = uvicorn.Server(config) + + loop = asyncio.get_running_loop() + + server_task = loop.create_task(server.serve()) + + def signal_handler() -> None: + # prevents the uvicorn signal handler to exit early + server_task.cancel() + + loop.add_signal_handler(signal.SIGINT, signal_handler) + loop.add_signal_handler(signal.SIGTERM, signal_handler) + + try: + await server_task + except asyncio.CancelledError: + logger.info("Gracefully stopping http server") + await server.shutdown() From 55712941e57bcfd662db2905811d6e2807b9153f Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 26 Jul 2024 22:27:44 -0400 Subject: [PATCH 055/167] [Bug Fix] Illegal memory access, FP8 Llama 3.1 405b (#6852) --- .../broadcast_load_epilogue_c3x.hpp | 46 +++++++++++++++---- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp index 877a9f5b9e5de..e4bc9752ed7db 100644 --- a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp +++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp @@ -328,20 +328,36 @@ struct Sm90ColOrScalarBroadcast { return EmptyProducerLoadCallbacks{}; } - template + template struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { CUTLASS_DEVICE - ConsumerStoreCallbacks(GTensor&& tCgCol, RTensor&& tCrCol, Params const& params) - : tCgCol(cute::forward(tCgCol)), - tCrCol(cute::forward(tCrCol)), - params(params) {} + ConsumerStoreCallbacks( + GTensor&& tCgCol, + RTensor&& tCrCol, + CTensor&& tCcCol, + ProblemShape problem_shape, + Params const& params + ): + tCgCol(cute::forward(tCgCol)), + tCrCol(cute::forward(tCrCol)), + tCcCol(cute::forward(tCcCol)), + m(get<0>(problem_shape)), + params(params) {} GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) - RTensor tCrCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + RTensor tCrCol; + CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) Params const& params; + int m; CUTLASS_DEVICE void begin() { + Tensor pred = make_tensor(shape(tCgCol)); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(pred); ++i) { + pred(i) = get<0>(tCcCol(i)) < m; + } + if (!params.col_broadcast) { fill(tCrCol, *(params.ptr_col)); return; @@ -349,7 +365,7 @@ struct Sm90ColOrScalarBroadcast { // Filter so we don't issue redundant copies over stride-0 modes // (only works if 0-strides are in same location, which is by construction) - copy_aligned(filter(tCgCol), filter(tCrCol)); + copy_if(pred, filter(tCgCol), filter(tCrCol)); } template @@ -381,8 +397,20 @@ struct Sm90ColOrScalarBroadcast { mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) - return ConsumerStoreCallbacks( - cute::move(tCgCol), cute::move(tCrCol), params); + // Generate an identity tensor 
matching the shape of the global tensor and + // partition the same way, this will be used to generate the predicate + // tensor for loading + Tensor cCol = make_identity_tensor(mCol.shape()); + Tensor tCcCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + cCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + return ConsumerStoreCallbacks( + cute::move(tCgCol), + cute::move(tCrCol), + cute::move(tCcCol), + args.problem_shape_mnkl, + params + ); } }; From 969d03226514d99f43cf7d17d1e336231d91751a Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Fri, 26 Jul 2024 23:02:25 -0400 Subject: [PATCH 056/167] [Bugfix]: Fix Tensorizer test failures (#6835) --- .buildkite/test-pipeline.yaml | 1 - tests/tensorizer_loader/conftest.py | 45 ++++++++++++++++++++++ tests/tensorizer_loader/test_tensorizer.py | 7 +--- 3 files changed, 47 insertions(+), 6 deletions(-) create mode 100644 tests/tensorizer_loader/conftest.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 633bc5ca95bf9..5b4a786305e1f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -220,7 +220,6 @@ steps: - label: Tensorizer Test #mirror_hardwares: [amd] - soft_fail: true fast_check: true commands: - apt-get install -y curl libsodium23 diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py new file mode 100644 index 0000000000000..c5c6fc1057d31 --- /dev/null +++ b/tests/tensorizer_loader/conftest.py @@ -0,0 +1,45 @@ +# isort: skip_file + +import contextlib +import gc + +import pytest +import ray +import torch + +from vllm.distributed import (destroy_distributed_environment, + destroy_model_parallel) +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig + + +def cleanup(): + destroy_model_parallel() + destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + gc.collect() + torch.cuda.empty_cache() + ray.shutdown() + + +@pytest.fixture() +def should_do_global_cleanup_after_test(request) -> bool: + """Allow subdirectories to skip global cleanup by overriding this fixture. + This can provide a ~10x speedup for non-GPU unit tests since they don't need + to initialize torch. 
+ """ + + return True + + +@pytest.fixture(autouse=True) +def cleanup_fixture(should_do_global_cleanup_after_test: bool): + yield + if should_do_global_cleanup_after_test: + cleanup() + + +@pytest.fixture(autouse=True) +def tensorizer_config(): + config = TensorizerConfig(tensorizer_uri="vllm") + return config \ No newline at end of file diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index b7030e3cd6d42..2adeae8874bdb 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -40,7 +40,6 @@ tensorize_model_for_testing_script = os.path.join( os.path.dirname(__file__), "tensorize_vllm_model_for_testing.py") - def is_curl_installed(): try: subprocess.check_call(['curl', '--version']) @@ -63,10 +62,6 @@ def write_keyfile(keyfile_path: str): with open(keyfile_path, 'wb') as f: f.write(encryption_params.key) -@pytest.fixture(autouse=True) -def tensorizer_config(): - config = TensorizerConfig(tensorizer_uri="vllm") - return config @patch('vllm.model_executor.model_loader.tensorizer.TensorizerAgent') @@ -105,6 +100,7 @@ def test_can_deserialize_s3(vllm_runner): @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_deserialized_encrypted_vllm_model_has_same_outputs( vllm_runner, tmp_path): + cleanup() with vllm_runner(model_ref) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") key_path = tmp_path / (model_ref + ".key") @@ -316,6 +312,7 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): + cleanup() model_ref = "facebook/opt-125m" model_path = tmp_path / (model_ref + ".tensors") config = TensorizerConfig(tensorizer_uri=str(model_path)) From ced36cd89b9c012eb066ef863b2d1ecf052f3e00 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 20:16:13 -0700 Subject: [PATCH 057/167] [ROCm] Upgrade PyTorch nightly version (#6845) --- Dockerfile.rocm | 4 ++-- docs/source/getting_started/amd-installation.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 7b4c0166a04bd..64bc0f3c12c75 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -55,8 +55,8 @@ RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ *"rocm-6.1"*) \ python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --no-cache-dir --pre \ - torch==2.5.0.dev20240710 \ - torchvision==0.20.0.dev20240710 \ + torch==2.5.0.dev20240726 \ + torchvision==0.20.0.dev20240726 \ --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ *) ;; esac diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 1c7d274b7c47e..9648d07d2790c 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -117,7 +117,7 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases. 
$ # Install PyTorch $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch==2.5.0.dev20240710 --index-url https://download.pytorch.org/whl/nightly/rocm6.1 + $ pip install --no-cache-dir --pre torch==2.5.0.dev20240726 --index-url https://download.pytorch.org/whl/nightly/rocm6.1 $ # Build & install AMD SMI $ pip install /opt/rocm/share/amd_smi From 3c3012398e4aecde9e40981d79a0576203158d24 Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Fri, 26 Jul 2024 20:20:16 -0700 Subject: [PATCH 058/167] [Doc] add VLLM_TARGET_DEVICE=neuron to documentation for neuron (#6844) Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com> --- docs/source/getting_started/neuron-installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst index 62bf779c339d5..0816524468cab 100644 --- a/docs/source/getting_started/neuron-installation.rst +++ b/docs/source/getting_started/neuron-installation.rst @@ -131,6 +131,6 @@ Once neuronx-cc and transformers-neuronx packages are installed, we will be able $ git clone https://github.com/vllm-project/vllm.git $ cd vllm $ pip install -U -r requirements-neuron.txt - $ pip install . + $ VLLM_TARGET_DEVICE="neuron" pip install . If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed. From ed94e4f427bce8611e198d051dbd3b0097b448e8 Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Sat, 27 Jul 2024 06:45:31 +0300 Subject: [PATCH 059/167] [Bugfix][Model] Jamba assertions and no chunked prefill by default for Jamba (#6784) --- vllm/engine/arg_utils.py | 6 +++++- vllm/model_executor/models/jamba.py | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cd64d3345b830..bad5be4917216 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -754,10 +754,14 @@ def create_engine_config(self, ) -> EngineConfig: use_sliding_window = (model_config.get_sliding_window() is not None) use_spec_decode = self.speculative_model is not None + has_seqlen_agnostic_layers = ( + model_config.contains_seqlen_agnostic_layers( + parallel_config)) if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora and not self.enable_prompt_adapter - and not self.enable_prefix_caching): + and not self.enable_prefix_caching + and not has_seqlen_agnostic_layers): self.enable_chunked_prefill = True logger.warning( "Chunked prefill is enabled by default for models with " diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index d4e4f0055aa2b..3444578227259 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -644,6 +644,11 @@ def __init__( lora_config: Optional[LoRAConfig] = None, scheduler_config: Optional[SchedulerConfig] = None, ) -> None: + assert not scheduler_config.chunked_prefill_enabled, \ + "Jamba currently does not support chunked prefill" + assert not cache_config.enable_prefix_caching, \ + "Jamba currently does not support prefix caching" + super().__init__() self.config = config self.scheduler_config = scheduler_config From 14dbd5a7674e5de2862c18adb711d9feecd35063 Mon Sep 17 00:00:00 2001 From: Joe Date: Fri, 26 Jul 2024 20:47:50 -0700 Subject: [PATCH 060/167] [Model] H2O Danube3-4b (#6451) --- .buildkite/run-cpu-test.sh | 2 +- 
.../kernels/benchmark_paged_attention.py | 2 +- benchmarks/kernels/benchmark_rope.py | 2 +- csrc/attention/attention_kernels.cu | 6 +++ tests/kernels/test_attention.py | 4 +- tests/kernels/test_cache.py | 8 ++- tests/kernels/test_pos_encoding.py | 2 +- tests/models/test_danube3_4b.py | 52 +++++++++++++++++++ vllm/attention/ops/paged_attn.py | 2 +- vllm/utils.py | 6 +++ 10 files changed, 79 insertions(+), 7 deletions(-) create mode 100644 tests/models/test_danube3_4b.py diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 21deec2bba973..45bc8eb2f8477 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,7 +23,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " pip install pytest Pillow protobuf - pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported # online inference docker exec cpu-test bash -c " diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 78cac8a555d1b..a04433142da42 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -175,7 +175,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: parser.add_argument("--num-kv-heads", type=int, default=8) parser.add_argument("--head-size", type=int, - choices=[64, 80, 96, 112, 128, 192, 256], + choices=[64, 80, 96, 112, 120, 128, 192, 256], default=128) parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) parser.add_argument("--use-alibi", action="store_true") diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 78736c7a7ba6f..f542684a9a2a9 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -94,7 +94,7 @@ def benchmark_rope_kernels_multi_lora( parser.add_argument("--num-heads", type=int, default=8) parser.add_argument("--head-size", type=int, - choices=[64, 80, 96, 112, 128, 192, 256], + choices=[64, 80, 96, 112, 120, 128, 192, 256], default=128) parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) parser.add_argument("--dtype", diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 350dbce1d7ba9..875570a1e894f 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -751,6 +751,9 @@ void paged_attention_v1_launcher( case 112: LAUNCH_PAGED_ATTENTION_V1(112); break; + case 120: + LAUNCH_PAGED_ATTENTION_V1(120); + break; case 128: LAUNCH_PAGED_ATTENTION_V1(128); break; @@ -912,6 +915,9 @@ void paged_attention_v2_launcher( case 112: LAUNCH_PAGED_ATTENTION_V2(112); break; + case 120: + LAUNCH_PAGED_ATTENTION_V2(120); + break; case 128: LAUNCH_PAGED_ATTENTION_V2(128); break; diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 2e6412c28958e..c7c6707461c3e 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -28,7 +28,7 @@ # FlashAttention forward only supports head dimension at most 128 # 
https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 -HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256 +HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256 ] if not is_hip() else [64, 80, 96, 112, 128] BLOCK_SIZES = [16, 32] @@ -134,6 +134,8 @@ def test_paged_attention( seed: int, device: str, ) -> None: + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f9a609464abfc..3fb9b59be1701 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -11,7 +11,7 @@ NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] +HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] BLOCK_SIZES = [8, 16, 32] # Arbitrary values for testing @@ -52,6 +52,8 @@ def test_copy_blocks( kv_cache_dtype: str, device: str, ) -> None: + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): @@ -124,6 +126,8 @@ def test_reshape_and_cache( device: str, kv_cache_dtype: str, ) -> None: + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): @@ -325,6 +329,8 @@ def test_swap_blocks( ) -> None: if kv_cache_dtype == "fp8" and "cpu" in direction: pytest.skip() + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 4c83659929d41..4a7ad6e0fa21d 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -10,7 +10,7 @@ IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] -HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] +HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] ROTARY_DIMS = [None, 32] # None means rotary dim == head size NUM_HEADS = [7, 17] # Arbitrary values for testing BATCH_SIZES = [1, 5] # Arbitrary values for testing diff --git a/tests/models/test_danube3_4b.py b/tests/models/test_danube3_4b.py new file mode 100644 index 0000000000000..bfaa275f73c19 --- /dev/null +++ b/tests/models/test_danube3_4b.py @@ -0,0 +1,52 @@ +"""Compare the outputs of HF and vLLM when using greedy sampling. + +This tests danube3 separately because its head size isn't supported on CPU yet. + +Run `pytest tests/models/test_danube3_4b.py`. 
+""" +import pytest + +from .utils import check_outputs_equal + +MODELS = ["h2oai/h2o-danube3-4b-base"] + +target_dtype = "half" + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [32]) +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", [target_dtype]) +def test_model_print( + vllm_runner, + model: str, + dtype: str, +) -> None: + with vllm_runner(model, dtype=dtype) as vllm_model: + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index ce7b4d129779c..0f6d2f2d1ab3f 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -31,7 +31,7 @@ class PagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 128, 192, 256] + return [64, 80, 96, 112, 120, 128, 192, 256] @staticmethod def get_kv_cache_shape( diff --git a/vllm/utils.py b/vllm/utils.py index 90be09fc7b967..1448316e66edb 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -508,6 +508,12 @@ def create_kv_caches_with_random( seed: int = 0, device: Optional[str] = "cuda", ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + + if cache_dtype == "fp8" and head_size % 16: + raise ValueError( + f"Does not support key cache of type fp8 with head_size {head_size}" + ) + torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) From 52f07e3dec2b76045208f5cfea5670b85a719cc6 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 20:54:27 -0700 Subject: [PATCH 061/167] [Hardware][TPU] Implement tensor parallelism with Ray (#5871) --- requirements-tpu.txt | 1 + vllm/attention/backends/pallas.py | 4 +- vllm/engine/llm_engine.py | 10 +- vllm/executor/ray_tpu_executor.py | 313 ++++++++++++++++++++++++++++++ vllm/worker/tpu_model_runner.py | 42 ++-- vllm/worker/tpu_worker.py | 16 +- 6 files changed, 365 insertions(+), 21 deletions(-) create mode 100644 vllm/executor/ray_tpu_executor.py diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 22487f5524dd7..c2140fbffec9f 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -4,4 +4,5 @@ # Dependencies for TPU # Currently, the TPU backend uses a nightly version of PyTorch XLA. # You can install the dependencies in Dockerfile.tpu. +ray triton # To avoid import errors diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index b83a83bb177d4..c53a2f91b89d7 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -55,8 +55,8 @@ class PallasMetadata(AttentionMetadata): # Currently, input sequences can only contain all prefills # or all decoding. 
- block_tables: Optional[torch.Tensor] - context_lens: Optional[torch.Tensor] + block_tables: Optional[torch.Tensor] = None + context_lens: Optional[torch.Tensor] = None @property def prefill_metadata(self) -> Optional["PallasMetadata"]: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 48d5305892219..004348d4c49a3 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -394,8 +394,14 @@ def _get_executor_cls(cls, from vllm.executor.neuron_executor import NeuronExecutor executor_class = NeuronExecutor elif engine_config.device_config.device_type == "tpu": - from vllm.executor.tpu_executor import TPUExecutor - executor_class = TPUExecutor + if distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_tpu_executor import RayTPUExecutor + executor_class = RayTPUExecutor + else: + assert distributed_executor_backend is None + from vllm.executor.tpu_executor import TPUExecutor + executor_class = TPUExecutor elif engine_config.device_config.device_type == "cpu": from vllm.executor.cpu_executor import CPUExecutor executor_class = CPUExecutor diff --git a/vllm/executor/ray_tpu_executor.py b/vllm/executor/ray_tpu_executor.py new file mode 100644 index 0000000000000..7048d47980723 --- /dev/null +++ b/vllm/executor/ray_tpu_executor.py @@ -0,0 +1,313 @@ +import asyncio +import os +from collections import defaultdict +from itertools import islice, repeat +from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Tuple, + Union) + +import vllm.envs as envs +from vllm.executor.executor_base import ExecutorAsyncBase +from vllm.executor.ray_utils import RayWorkerWrapper, ray +from vllm.executor.tpu_executor import TPUExecutor +from vllm.logger import init_logger +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + get_vllm_instance_id, make_async) + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + + +class RayTPUExecutor(TPUExecutor): + + def __init__(self, *args, **kwargs): + # This is non-None when the execute model loop is running + # in the parallel workers. It's a coroutine in the AsyncLLMEngine case. + self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + # Updated by implementations that require additional args to be passed + # to the _run_workers execute_model call + self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {} + + super().__init__(*args, **kwargs) + + def _init_executor(self) -> None: + assert self.parallel_config.distributed_executor_backend == "ray" + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel TPU workers. + self._init_workers_ray(placement_group) + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: Optional[RayWorkerWrapper] = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerWrapper] = [] + + # Create the workers. 
+ driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("TPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + + assert self.speculative_config is None + worker_module_name = "vllm.worker.tpu_worker" + worker_class_name = "TPUWorker" + + worker = ray.remote( + num_cpus=0, + resources={"TPU": 1}, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + trust_remote_code=self.model_config.trust_remote_code, + ) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + self.driver_worker = RayWorkerWrapper( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + trust_remote_code=self.model_config.trust_remote_code, + ) + else: + # Else, added to the list of workers. + self.workers.append(worker) + + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any TPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "TPU node.") + + # Get the set of TPU IDs used on each node. + worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", + use_dummy_driver=True) + + node_workers = defaultdict(list) + for i, (node_id, _) in enumerate(worker_node_and_gpu_ids): + node_workers[node_id].append(i) + + VLLM_INSTANCE_ID = get_vllm_instance_id() + + # Set environment variables for the driver and workers. + all_args_to_update_environment_variables = [({ + "VLLM_INSTANCE_ID": + VLLM_INSTANCE_ID, + "VLLM_TRACE_FUNCTION": + str(envs.VLLM_TRACE_FUNCTION), + }, ) for _ in worker_node_and_gpu_ids] + self._run_workers("update_environment_variables", + all_args=all_args_to_update_environment_variables) + + if len(node_workers) == 1: + # in single node case, we don't need to get the IP address. + # the loopback address is sufficient + # NOTE: a node may have several IP addresses, one for each + # network interface. `get_ip()` might return any of them, + # while they might not work for communication inside the node + # if the network setup is complicated. Using the loopback address + # solves this issue, as it always works for communication inside + # the node. + driver_ip = "127.0.0.1" + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Initialize the actual workers inside worker wrapper. + init_worker_all_kwargs = [ + self._get_worker_kwargs( + local_rank=node_workers[node_id].index(rank), + rank=rank, + distributed_init_method=distributed_init_method, + ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) + ] + self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) + + self._run_workers("init_device") + self._run_workers("load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers) + + def _driver_execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + """Run execute_model in the driver worker. + + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. 
+ """ + return self.driver_worker.execute_method("execute_model", + execute_model_req) + + def _run_workers( + self, + method: str, + *args, + async_run_remote_workers_only: bool = False, + all_args: Optional[List[Tuple[Any, ...]]] = None, + all_kwargs: Optional[List[Dict[str, Any]]] = None, + use_dummy_driver: bool = False, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers. Can be used in the following + ways: + + - async_run_remote_workers_only: If True the method will be run only + in the remote workers, not the driver worker. It will also be + run asynchronously and return a list of futures rather than blocking + on the results. + - args/kwargs: All workers share the same args/kwargs + - all_args/all_kwargs: args/kwargs for each worker are specified + individually + """ + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + count = len(self.workers) + all_worker_args = repeat(args, count) if all_args is None \ + else islice(all_args, 1, None) + all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ + else islice(all_kwargs, 1, None) + + # Start the ray workers first. + ray_worker_outputs = [ + worker.execute_method.remote(method, *worker_args, **worker_kwargs) + for (worker, worker_args, worker_kwargs + ) in zip(self.workers, all_worker_args, all_worker_kwargs) + ] + + if async_run_remote_workers_only: + # Just return futures + return ray_worker_outputs + + driver_args = args if all_args is None else all_args[0] + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + + # Start the driver worker after all the ray workers. + if not use_dummy_driver: + driver_worker_output = self.driver_worker.execute_method( + method, *driver_args, **driver_kwargs) + else: + assert self.driver_dummy_worker is not None + driver_worker_output = ray.get( + self.driver_dummy_worker.execute_method.remote( + method, *driver_args, **driver_kwargs)) + # Get the results of the ray workers. + if self.workers: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) + + def determine_num_available_blocks(self) -> Tuple[int, int]: + num_blocks = self._run_workers("determine_num_available_blocks", ) + num_tpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + return num_tpu_blocks, num_cpu_blocks + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, + num_cpu_blocks) + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + self._run_workers("initialize_cache", + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) + + def execute_model( + self, + execute_model_req: ExecuteModelRequest, + ) -> List[SamplerOutput]: + if self.parallel_worker_tasks is None: + self.parallel_worker_tasks = self._run_workers( + "start_worker_execution_loop", + async_run_remote_workers_only=True, + **self.extra_execute_model_run_workers_kwargs) + + # Only the driver worker returns the sampling results. 
+ return self._driver_execute_model(execute_model_req) + + def stop_remote_worker_execution_loop(self) -> None: + if self.parallel_worker_tasks is None: + return + + self._driver_execute_model() + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + self._wait_for_tasks_completion(parallel_worker_tasks) + + +class RayTPUExecutorAsync(RayTPUExecutor, ExecutorAsyncBase): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.driver_exec_method = make_async(self.driver_worker.execute_method) + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if self.parallel_worker_tasks is None: + # Start model execution loop running in the parallel workers + self.parallel_worker_tasks = asyncio.create_task( + self._start_worker_execution_loop()) + + # Only the driver worker returns the sampling results. + return await self._driver_execute_model_async(execute_model_req) + + async def stop_remote_worker_execution_loop_async(self) -> None: + if self.parallel_worker_tasks is None: + return + + await self._driver_execute_model_async() + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + await parallel_worker_tasks + + async def _driver_execute_model_async( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + return await self.driver_exec_method("execute_model", + execute_model_req) + + async def _start_worker_execution_loop(self): + coros = [ + worker.execute_method.remote("start_worker_execution_loop") + for worker in self.workers + ] + return await asyncio.gather(*coros) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 8a8b412db6731..e5bb101fc7df4 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -1,6 +1,7 @@ import time from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union +from unittest.mock import patch import numpy as np import torch @@ -45,6 +46,7 @@ class ModelInputForTPU(ModelRunnerInputBase): num_samples: int best_of: List[int] seq_groups: List[List[int]] + virtual_engine: int = 0 def as_broadcastable_tensor_dict( self) -> Dict[str, Union[int, torch.Tensor]]: @@ -55,6 +57,9 @@ def as_broadcastable_tensor_dict( "t": self.t, "p": self.p, "num_samples": self.num_samples, + "best_of": self.best_of, + "seq_groups": self.seq_groups, + "virtual_engine": self.virtual_engine, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -113,16 +118,30 @@ def __init__( def load_model(self) -> None: self.device = self.device_config.device - model = get_model( - model_config=self.model_config, - load_config=self.load_config, - device_config=self.device_config, - parallel_config=self.parallel_config, - cache_config=self.cache_config, - scheduler_config=self.scheduler_config, - multimodal_config=self.multimodal_config, - lora_config=None, - ) + # NOTE(woosuk): While the executor assigns the TP ranks to the worker + # process, the ranks can be different from the ranks internally assigned + # by the xm runtime. Therefore, there is a mismatch in the rank + # assignment between the gloo (cpu) runtime and the xm (tpu) runtime. 
+ # This is not a problem in linear layers because all-reduce is + # rank-agnostic. However, it matters for all-gather as the ranks + # determine the order of concatenating the output tensors. + # As a workaround, we use the xm's rank assignment only when loading + # the embedding weights. + xm_tp_rank = xm.get_ordinal() + with patch( + "vllm.model_executor.layers.vocab_parallel_embedding." + "get_tensor_model_parallel_rank", + return_value=xm_tp_rank): + model = get_model( + model_config=self.model_config, + load_config=self.load_config, + device_config=self.device_config, + parallel_config=self.parallel_config, + cache_config=self.cache_config, + scheduler_config=self.scheduler_config, + multimodal_config=self.multimodal_config, + lora_config=None, + ) model = model.eval() xm.wait_device_ops() @@ -463,10 +482,11 @@ def make_model_input_from_broadcasted_tensor_dict( tensor_dict, attn_backend=self.attn_backend) return model_input + @torch.no_grad() def execute_model( self, model_input: ModelInputForTPU, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + kv_caches: Optional[List[Any]], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, ) -> List[SamplerOutput]: diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 03011e03058d8..c88aba7ae08cd 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -70,13 +70,13 @@ def __init__( def init_device(self) -> None: os.environ["PJRT_DEVICE"] = "TPU" - self.device = xm.xla_device() - self.device_config.device = self.device torch.set_grad_enabled(False) torch.set_default_dtype(self.model_config.dtype) - # NOTE(woosuk): This is just a hack to initialize the TP group. - # This cannot perform the actual communication ops. + # NOTE(woosuk): This is just to initialize the TP group and broadcast + # the input objects on CPU. The all-reduce and all-gather ops on TPU + # are invoked by `xm.all_reduce` and `xm.all_gather` which use their + # own context. init_distributed_environment( world_size=self.parallel_config.world_size, rank=self.rank, @@ -88,6 +88,11 @@ def init_device(self) -> None: self.parallel_config.tensor_parallel_size, self.parallel_config.pipeline_parallel_size) + # Device initialization should happen after initializing the distributed + # runtime. + self.device = xm.xla_device() + self.device_config.device = self.device + # Set random seed. set_random_seed(self.model_config.seed) xm.set_rng_state(self.model_config.seed, self.device) @@ -200,8 +205,7 @@ def get_cache_block_size_bytes(self) -> int: @property def do_metadata_broadcast(self) -> bool: - # TODO(woosuk): Support TP. 
- return False + return self.parallel_config.tensor_parallel_size > 1 @property def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: From c53041ae3b8ded4ac4c3fc745be6bc695b9f0c78 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 27 Jul 2024 05:47:33 +0100 Subject: [PATCH 062/167] [Doc] Add missing mock import to docs `conf.py` (#6834) --- .readthedocs.yaml | 1 + docs/source/conf.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 428e199088589..f1959ad2743f3 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,6 +10,7 @@ build: sphinx: configuration: docs/source/conf.py + fail_on_warning: true # If using Sphinx, optionally build your docs in additional formats such as PDF formats: diff --git a/docs/source/conf.py b/docs/source/conf.py index f4cec05663fcd..b867bfd89dc17 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -94,6 +94,7 @@ def setup(app): # Mock out external dependencies here, otherwise the autodoc pages may be blank. autodoc_mock_imports = [ + "aiohttp", "cpuinfo", "torch", "transformers", @@ -141,5 +142,6 @@ def add_line(self, line: str, source: str, *lineno: int) -> None: } autodoc_preserve_defaults = True +autodoc_warningiserror = True navigation_with_keys = False From 593e79e7337f7fd9e92b7554dabdff96769dbf15 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Fri, 26 Jul 2024 23:15:20 -0600 Subject: [PATCH 063/167] [Bugfix] torch.set_num_threads() in multiproc_gpu_executor (#6802) [Bugfix] Use torch.set_num_threads() to configure parallelism in multiproc_gpu_executor (#6802) Signed-off-by: Travis Johnson --- vllm/executor/multiproc_gpu_executor.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 19f7a497cdd9f..e1e92958e667c 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -6,6 +6,8 @@ from functools import partial from typing import Any, List, Optional +import torch + from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.gpu_executor import create_worker @@ -45,10 +47,23 @@ def _init_executor(self) -> None: # Disable torch async compiling which won't work with daemonic processes os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" - # Set OMP_NUM_THREADS to 1 if it is not set explicitly, avoids CPU - # contention amongst the shards - if "OMP_NUM_THREADS" not in os.environ: - os.environ["OMP_NUM_THREADS"] = "1" + # Configure thread parallelism if OMP_NUM_THREADS isn't set + # + # Helps to avoid CPU contention. The default of spawning a thread per + # core combined with multiprocessing for each GPU can have a negative + # impact on performance. The contention is amplified when running in a + # container where CPU limits can cause throttling. + default_omp_num_threads = 1 + if "OMP_NUM_THREADS" not in os.environ and ( + current_parallelism := + torch.get_num_threads()) > default_omp_num_threads: + logger.warning( + "Reducing Torch parallelism from %d threads to %d to avoid " + "unnecessary CPU contention. 
Set OMP_NUM_THREADS in the " + "external environment to tune this value as needed.", + current_parallelism, default_omp_num_threads) + os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads) + torch.set_num_threads(default_omp_num_threads) # workaround for https://github.com/vllm-project/vllm/issues/6103 if world_size > 1: From aa46953a20685377fc51dcde172114ddd7ffdc68 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Fri, 26 Jul 2024 22:44:13 -0700 Subject: [PATCH 064/167] [Misc][VLM][Doc] Consolidate offline examples for vision language models (#6858) Co-authored-by: Cyrus Leung --- examples/fuyu_example.py | 31 ---- examples/llava_example.py | 25 --- examples/llava_next_example.py | 36 ---- examples/minicpmv_example.py | 55 ------ examples/offline_inference_vision_language.py | 174 ++++++++++++++++++ examples/paligemma_example.py | 25 --- examples/phi3v_example.py | 40 ---- 7 files changed, 174 insertions(+), 212 deletions(-) delete mode 100644 examples/fuyu_example.py delete mode 100644 examples/llava_example.py delete mode 100644 examples/llava_next_example.py delete mode 100644 examples/minicpmv_example.py create mode 100644 examples/offline_inference_vision_language.py delete mode 100644 examples/paligemma_example.py delete mode 100644 examples/phi3v_example.py diff --git a/examples/fuyu_example.py b/examples/fuyu_example.py deleted file mode 100644 index c92b8fb4bc286..0000000000000 --- a/examples/fuyu_example.py +++ /dev/null @@ -1,31 +0,0 @@ -import requests -from PIL import Image - -from vllm import LLM, SamplingParams - - -def run_fuyu(): - llm = LLM(model="adept/fuyu-8b", max_model_len=4096) - - # single-image prompt - prompt = "What is the highest life expectancy at of male?\n" - url = "https://huggingface.co/adept/fuyu-8b/resolve/main/chart.png" - image = Image.open(requests.get(url, stream=True).raw) - sampling_params = SamplingParams(temperature=0, max_tokens=64) - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }, - sampling_params=sampling_params) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_fuyu() diff --git a/examples/llava_example.py b/examples/llava_example.py deleted file mode 100644 index 4c9eabd261e5c..0000000000000 --- a/examples/llava_example.py +++ /dev/null @@ -1,25 +0,0 @@ -from vllm import LLM -from vllm.assets.image import ImageAsset - - -def run_llava(): - llm = LLM(model="llava-hf/llava-1.5-7b-hf") - - prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" - - image = ImageAsset("stop_sign").pil_image - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_llava() diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py deleted file mode 100644 index fd53a6def1a13..0000000000000 --- a/examples/llava_next_example.py +++ /dev/null @@ -1,36 +0,0 @@ -from io import BytesIO - -import requests -from PIL import Image - -from vllm import LLM, SamplingParams - - -def run_llava_next(): - llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=4096) - - prompt = "[INST] \nWhat is shown in this image? 
[/INST]" - url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg" - image = Image.open(BytesIO(requests.get(url).content)) - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - max_tokens=100) - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": { - "image": image - } - }, - sampling_params=sampling_params) - - generated_text = "" - for o in outputs: - generated_text += o.outputs[0].text - - print(f"LLM output:{generated_text}") - - -if __name__ == "__main__": - run_llava_next() diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py deleted file mode 100644 index bf20a7ea04ad4..0000000000000 --- a/examples/minicpmv_example.py +++ /dev/null @@ -1,55 +0,0 @@ -from transformers import AutoTokenizer - -from vllm import LLM, SamplingParams -from vllm.assets.image import ImageAsset - -# 2.0 -# The official repo doesn't work yet, so we need to use a fork for now -# For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 -# MODEL_NAME = "HwwwH/MiniCPM-V-2" -# 2.5 -MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5" - -image = ImageAsset("stop_sign").pil_image.convert("RGB") - -tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) -llm = LLM(model=MODEL_NAME, - gpu_memory_utilization=1, - trust_remote_code=True, - max_model_len=4096) - -messages = [{ - 'role': - 'user', - 'content': - '(./)\n' + "What's the content of the image?" -}] -prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) -# 2.0 -# stop_token_ids = [tokenizer.eos_id] -# 2.5 -stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] - -sampling_params = SamplingParams( - stop_token_ids=stop_token_ids, - # temperature=0.7, - # top_p=0.8, - # top_k=100, - # seed=3472, - max_tokens=1024, - # min_tokens=150, - temperature=0, - use_beam_search=True, - # length_penalty=1.2, - best_of=3) - -outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": image - } -}, - sampling_params=sampling_params) -print(outputs[0].outputs[0].text) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py new file mode 100644 index 0000000000000..4a09f77ca59f8 --- /dev/null +++ b/examples/offline_inference_vision_language.py @@ -0,0 +1,174 @@ +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +from transformers import AutoTokenizer + +from vllm import LLM, SamplingParams +from vllm.assets.image import ImageAsset +from vllm.utils import FlexibleArgumentParser + +# Input image and question +image = ImageAsset("cherry_blossom").pil_image.convert("RGB") +question = "What is the content of this image?" 
+ + +# LLaVA-1.5 +def run_llava(question): + + prompt = f"USER: \n{question}\nASSISTANT:" + + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + + return llm, prompt + + +# LLaVA-1.6/LLaVA-NeXT +def run_llava_next(question): + + prompt = f"[INST] \n{question} [/INST]" + llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf") + + return llm, prompt + + +# Fuyu +def run_fuyu(question): + + prompt = f"{question}\n" + llm = LLM(model="adept/fuyu-8b") + + return llm, prompt + + +# Phi-3-Vision +def run_phi3v(question): + + prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" # noqa: E501 + # Note: The default setting of max_num_seqs (256) and + # max_model_len (128k) for this model may cause OOM. + # You may lower either to run this example on lower-end GPUs. + + # In this example, we override max_num_seqs to 5 while + # keeping the original context length of 128k. + llm = LLM( + model="microsoft/Phi-3-vision-128k-instruct", + trust_remote_code=True, + max_num_seqs=5, + ) + return llm, prompt + + +# PaliGemma +def run_paligemma(question): + + prompt = question + llm = LLM(model="google/paligemma-3b-mix-224") + + return llm, prompt + + +# Chameleon +def run_chameleon(question): + + prompt = f"{question}" + llm = LLM(model="facebook/chameleon-7b") + return llm, prompt + + +# MiniCPM-V +def run_minicpmv(question): + + # 2.0 + # The official repo doesn't work yet, so we need to use a fork for now + # For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa + # model_name = "HwwwH/MiniCPM-V-2" + + # 2.5 + model_name = "openbmb/MiniCPM-Llama3-V-2_5" + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + llm = LLM( + model=model_name, + trust_remote_code=True, + ) + + messages = [{ + 'role': 'user', + 'content': f'(./)\n{question}' + }] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + return llm, prompt + + +model_example_map = { + "llava": run_llava, + "llava-next": run_llava_next, + "fuyu": run_fuyu, + "phi3_v": run_phi3v, + "paligemma": run_paligemma, + "chameleon": run_chameleon, + "minicpmv": run_minicpmv, +} + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + + llm, prompt = model_example_map[model](question) + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
+ sampling_params = SamplingParams(temperature=0.2, max_tokens=64) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + "image": image + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + "image": image + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models') + args = parser.parse_args() + parser.add_argument('--model-type', + '-m', + type=str, + default="llava", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + + args = parser.parse_args() + main(args) diff --git a/examples/paligemma_example.py b/examples/paligemma_example.py deleted file mode 100644 index 92a3cb3ac4129..0000000000000 --- a/examples/paligemma_example.py +++ /dev/null @@ -1,25 +0,0 @@ -from vllm import LLM -from vllm.assets.image import ImageAsset - - -def run_paligemma(): - llm = LLM(model="google/paligemma-3b-mix-224") - - prompt = "caption es" - - image = ImageAsset("stop_sign").pil_image - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_paligemma() diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py deleted file mode 100644 index ae8c38d84e8fd..0000000000000 --- a/examples/phi3v_example.py +++ /dev/null @@ -1,40 +0,0 @@ -from vllm import LLM, SamplingParams -from vllm.assets.image import ImageAsset - - -def run_phi3v(): - model_path = "microsoft/Phi-3-vision-128k-instruct" - - # Note: The default setting of max_num_seqs (256) and - # max_model_len (128k) for this model may cause OOM. - # You may lower either to run this example on lower-end GPUs. - - # In this example, we override max_num_seqs to 5 while - # keeping the original context length of 128k. 
- llm = LLM( - model=model_path, - trust_remote_code=True, - max_num_seqs=5, - ) - - image = ImageAsset("cherry_blossom").pil_image - - # single-image prompt - prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n" # noqa: E501 - sampling_params = SamplingParams(temperature=0, max_tokens=64) - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }, - sampling_params=sampling_params) - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_phi3v() From 925de97e05dd4709fcd80691cb37da5e582c22e8 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Fri, 26 Jul 2024 23:24:08 -0700 Subject: [PATCH 065/167] [Bugfix] Fix VLM example typo (#6859) --- examples/offline_inference_vision_language.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 4a09f77ca59f8..8a63653343db6 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -158,7 +158,6 @@ def main(args): parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' 'vision language models') - args = parser.parse_args() parser.add_argument('--model-type', '-m', type=str, From a57d75821c6177da75fdebf171d528eef5301961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wang=20Ran=20=28=E6=B1=AA=E7=84=B6=29?= Date: Sat, 27 Jul 2024 17:07:02 +0800 Subject: [PATCH 066/167] [bugfix] make args.stream work (#6831) --- examples/api_client.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/api_client.py b/examples/api_client.py index 27a2a08b7b0c3..49a085febdc57 100644 --- a/examples/api_client.py +++ b/examples/api_client.py @@ -31,7 +31,10 @@ def post_http_request(prompt: str, "max_tokens": 16, "stream": stream, } - response = requests.post(api_url, headers=headers, json=pload, stream=True) + response = requests.post(api_url, + headers=headers, + json=pload, + stream=stream) return response From ecb33a28cb6c10ebf3b1aa139f72e759cacb8c15 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 27 Jul 2024 02:54:14 -0700 Subject: [PATCH 067/167] [CI/Build][Doc] Update CI and Doc for VLM example changes (#6860) --- .buildkite/test-pipeline.yaml | 3 +-- docs/source/models/vlm.rst | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5b4a786305e1f..be8807df0b098 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -140,14 +140,13 @@ steps: working_dir: "/vllm-workspace/examples" mirror_hardwares: [amd] commands: - # install aws cli for llava_example.py # install tensorizer for tensorize_vllm_model.py - pip install awscli tensorizer - python3 offline_inference.py - python3 cpu_offload.py - python3 offline_inference_with_prefix.py - python3 llm_engine_example.py - - python3 llava_example.py + - python3 offline_inference_vision_language.py - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - label: Inputs Test diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index ef4ce0d44a162..a385605c9f8f6 100644 --- 
a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -73,7 +73,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI generated_text = o.outputs[0].text print(generated_text) -A code example can be found in `examples/llava_example.py `_. +A code example can be found in `examples/offline_inference_vision_language.py `_. Online OpenAI Vision API Compatible Inference From 1ad86acf1789650e2ff27586e36a8159d52755dd Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 27 Jul 2024 19:53:07 +0800 Subject: [PATCH 068/167] [Model] Initial support for BLIP-2 (#5920) Co-authored-by: ywang96 --- docs/source/models/supported_models.rst | 8 + examples/offline_inference_vision_language.py | 11 + examples/template_blip2.jinja | 11 + tests/models/test_blip2.py | 102 +++ tests/models/test_fuyu.py | 8 +- tests/models/test_minicpmv.py | 8 +- tests/models/test_phi3v.py | 8 +- vllm/model_executor/models/__init__.py | 6 +- vllm/model_executor/models/blip.py | 269 +++++++ vllm/model_executor/models/blip2.py | 669 ++++++++++++++++++ vllm/model_executor/models/opt.py | 17 +- vllm/multimodal/base.py | 11 +- 12 files changed, 1107 insertions(+), 21 deletions(-) create mode 100644 examples/template_blip2.jinja create mode 100644 tests/models/test_blip2.py create mode 100644 vllm/model_executor/models/blip.py create mode 100644 vllm/model_executor/models/blip2.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 483f552bba238..83c1b9c8bce86 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -7,6 +7,8 @@ vLLM supports a variety of generative Transformer models in `HuggingFace Transfo The following is the list of model architectures that are currently supported by vLLM. Alongside each architecture, we include some popular models that use it. +---- + Decoder-only Language Models ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. list-table:: @@ -186,6 +188,10 @@ Vision Language Models - Models - Example HuggingFace Models - :ref:`LoRA ` + * - :code:`Blip2ForConditionalGeneration` + - BLIP-2 + - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. + - * - :code:`ChameleonForConditionalGeneration` - Chameleon - :code:`facebook/chameleon-7b` etc. @@ -215,6 +221,8 @@ Vision Language Models - :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc. - +---- + If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` for instructions on how to implement support for your model. diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 8a63653343db6..04ba1a96314c9 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -106,6 +106,16 @@ def run_minicpmv(question): return llm, prompt +# BLIP-2 +def run_blip2(question): + + # BLIP-2 prompt format is inaccurate on HuggingFace model repository. 
+ # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa + prompt = f"Question: {question} Answer:" + llm = LLM(model="Salesforce/blip2-opt-2.7b") + return llm, prompt + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -114,6 +124,7 @@ def run_minicpmv(question): "paligemma": run_paligemma, "chameleon": run_chameleon, "minicpmv": run_minicpmv, + "blip-2": run_blip2, } diff --git a/examples/template_blip2.jinja b/examples/template_blip2.jinja new file mode 100644 index 0000000000000..fd41a7f7fa666 --- /dev/null +++ b/examples/template_blip2.jinja @@ -0,0 +1,11 @@ +{%- for message in messages -%} + {%- if message['role'] == 'user' -%} + {{- 'Question: ' + message['content'] + ' ' -}} + {%- elif message['role'] == 'assistant' -%} + {{- 'Answer: ' + message['content'] + ' ' -}} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {{- 'Answer:' -}} +{% endif %} diff --git a/tests/models/test_blip2.py b/tests/models/test_blip2.py new file mode 100644 index 0000000000000..26afd57ae6106 --- /dev/null +++ b/tests/models/test_blip2.py @@ -0,0 +1,102 @@ +from typing import List, Optional, Tuple + +import pytest +from transformers import AutoTokenizer + +from vllm.multimodal.utils import rescale_image_size +from vllm.sequence import SampleLogprobs + +from ..conftest import IMAGE_ASSETS +from .utils import check_logprobs_close + +pytestmark = pytest.mark.vlm + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "Question: What's the content of the image? Answer:", + "cherry_blossom": + "Question: What is the season? Answer:", +}) + + +def vllm_to_hf_output(vllm_output: Tuple[List[int], str, + Optional[SampleLogprobs]], + model: str): + """Sanitize vllm output to be comparable with hf output.""" + _, output_str, out_logprobs = vllm_output + + hf_output_str = output_str + "\n" + + tokenizer = AutoTokenizer.from_pretrained(model) + hf_output_ids = tokenizer.encode(hf_output_str) + assert hf_output_ids[0] == tokenizer.bos_token_id + hf_output_ids = hf_output_ids[1:] + + return hf_output_ids, hf_output_str, out_logprobs + + +@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"]) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalData objects and corresponding + vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. 
+ """ + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # max_model_len should be greater than image_feature_size + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, model) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + ) diff --git a/tests/models/test_fuyu.py b/tests/models/test_fuyu.py index 25f63a3d64d0e..7d0f3be5ea008 100644 --- a/tests/models/test_fuyu.py +++ b/tests/models/test_fuyu.py @@ -77,8 +77,8 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=vllm_images) - for prompts, vllm_images in inputs_per_image + images=images) + for prompts, images in inputs_per_image ] with hf_runner(model, dtype=dtype) as hf_model: @@ -89,9 +89,9 @@ def run_test( hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, - images=hf_images, + images=images, eos_token_id=eos_token_id) - for prompts, hf_images in inputs_per_image + for prompts, images in inputs_per_image ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py index 9124fa7a6238c..c57f0f8c08548 100644 --- a/tests/models/test_minicpmv.py +++ b/tests/models/test_minicpmv.py @@ -88,9 +88,9 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=vllm_images, + images=images, stop_token_ids=stop_token_ids) - for prompts, vllm_images in inputs_per_image + for prompts, images in inputs_per_image ] with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad(): @@ -114,9 +114,9 @@ def to(self, device: torch.types.Device): hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, - images=hf_images, + images=images, tokenizer=tokenizer) - for prompts, hf_images in inputs_per_image + for prompts, images in inputs_per_image ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 9da25ab8d78fe..35ffe4ef50a85 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -101,8 +101,8 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=vllm_images) - for prompts, vllm_images in inputs_per_image + images=images) + for prompts, images in inputs_per_image ] # use eager mode for hf runner, since phi3_v didn't work with flash_attn @@ -114,9 +114,9 @@ def run_test( hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, - images=hf_images, + images=images, eos_token_id=eos_token_id) - for prompts, hf_images in inputs_per_image + for prompts, images in inputs_per_image ] for 
hf_outputs, vllm_outputs in zip(hf_outputs_per_image, diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index ead64c0e92553..fe04c6db5fbc2 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -16,6 +16,8 @@ "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), + "Blip2ForConditionalGeneration": + ("blip2", "Blip2ForConditionalGeneration"), "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), @@ -56,8 +58,8 @@ "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"), - "PaliGemmaForConditionalGeneration": - ("paligemma", "PaliGemmaForConditionalGeneration"), + "PaliGemmaForConditionalGeneration": ("paligemma", + "PaliGemmaForConditionalGeneration"), "PhiForCausalLM": ("phi", "PhiForCausalLM"), "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"), "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py new file mode 100644 index 0000000000000..0b124d5e8a85a --- /dev/null +++ b/vllm/model_executor/models/blip.py @@ -0,0 +1,269 @@ +"""Minimal implementation of BlipVisionModel intended to be only used +within a vision language model.""" +from typing import Optional, Union + +import torch +import torch.nn as nn +from PIL import Image +from transformers import Blip2VisionConfig, BlipVisionConfig +from transformers.models.blip.modeling_blip import BlipAttention + +from vllm.config import ModelConfig +from vllm.inputs import LLMInputs +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.multimodal.image import (cached_get_tokenizer, + repeat_and_pad_image_tokens) +from vllm.sequence import SequenceData + + +def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: + assert image_size % patch_size == 0 + return image_size // patch_size + + +def get_blip_num_patches(*, image_size: int, patch_size: int) -> int: + grid_length = get_blip_patch_grid_length(image_size=image_size, + patch_size=patch_size) + return grid_length * grid_length + + +def get_blip_image_feature_size( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], ) -> int: + return get_blip_num_patches(image_size=hf_config.image_size, + patch_size=hf_config.patch_size) + + +def get_max_blip_image_tokens( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], ) -> int: + return get_blip_image_feature_size(hf_config) + + +def dummy_seq_data_for_blip( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], + seq_len: int, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, +): + if image_feature_size_override is None: + image_feature_size = get_blip_image_feature_size(hf_config) + else: + image_feature_size = image_feature_size_override + + token_ids = [image_token_id] * image_feature_size + token_ids += [0] * (seq_len - image_feature_size) + return SequenceData(token_ids) + + +def dummy_image_for_blip( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], + *, + image_width_override: Optional[int] = None, + 
image_height_override: Optional[int] = None, +): + width = height = hf_config.image_size + if image_width_override is not None: + width = image_width_override + if image_height_override is not None: + height = image_height_override + + image = Image.new("RGB", (width, height), color=0) + return {"image": image} + + +def input_processor_for_blip( + model_config: ModelConfig, + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], + llm_inputs: LLMInputs, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, +): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + tokenizer = cached_get_tokenizer(model_config.tokenizer) + + if image_feature_size_override is None: + image_feature_size = get_blip_image_feature_size(hf_config) + else: + image_feature_size = image_feature_size_override + + new_prompt, new_token_ids = repeat_and_pad_image_tokens( + tokenizer, + llm_inputs.get("prompt"), + llm_inputs["prompt_token_ids"], + image_token_id=image_token_id, + repeat_count=image_feature_size, + ) + + # NOTE: Create a defensive copy of the original inputs + return LLMInputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa +class BlipVisionEmbeddings(nn.Module): + + def __init__(self, config: BlipVisionConfig): + super().__init__() + + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + ) + + self.num_patches = get_blip_num_patches(image_size=self.image_size, + patch_size=self.patch_size) + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter( + torch.randn(1, self.num_positions, self.embed_dim)) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to( + dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + + position_embeds = self.position_embedding.to(target_dtype) + embeddings = embeddings + position_embeds[:, :embeddings.size(1), :] + + return embeddings + + +class BlipMLP(nn.Module): + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + self.config = config + + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config) + self.fc2 = RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + + return hidden_states + + +class BlipEncoderLayer(nn.Module): + + def __init__(self, + config: BlipVisionConfig, 
+ quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + self.self_attn = BlipAttention(config) + self.layer_norm1 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.mlp = BlipMLP(config, quant_config=quant_config) + self.layer_norm2 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class BlipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self + attention layers. Each layer is a [`BlipEncoderLayer`]. + + Args: + config: BlipConfig + """ + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + + self.config = config + + if num_hidden_layers_override is None: + num_hidden_layers = config.num_hidden_layers + else: + num_hidden_layers = num_hidden_layers_override + + self.layers = nn.ModuleList([ + BlipEncoderLayer(config=config, quant_config=quant_config) + for _ in range(num_hidden_layers) + ]) + + def forward(self, inputs_embeds: torch.Tensor): + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer(hidden_states) + + return hidden_states + + +class BlipVisionModel(nn.Module): + config_class = BlipVisionConfig + main_input_name = "pixel_values" + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + + self.config = config + + self.embeddings = BlipVisionEmbeddings(config) + self.encoder = BlipEncoder( + config=config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_override, + ) + self.post_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + hidden_states = self.embeddings(pixel_values) + hidden_states = self.encoder(inputs_embeds=hidden_states) + + return self.post_layernorm(hidden_states) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py new file mode 100644 index 0000000000000..e00e6c0806957 --- /dev/null +++ b/vllm/model_executor/models/blip2.py @@ -0,0 +1,669 @@ +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict + +import torch +import torch.nn as nn +from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig, + apply_chunking_to_forward) + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.opt import OPTModel +from vllm.model_executor.sampling_metadata import SamplingMetadata +from 
vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData + +from .blip import (BlipVisionModel, dummy_image_for_blip, + get_max_blip_image_tokens) +from .interfaces import SupportsVision +from .utils import merge_vision_embeddings + +_KEYS_TO_MODIFY_MAPPING = { + "language_model.lm_head": "lm_head", + "language_model.model": "language_model", +} + + +class Blip2QFormerMultiHeadAttention(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + is_cross_attention: bool = False, + ) -> None: + super().__init__() + + self.config = config + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of " + f"the number of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = (config.hidden_size // + config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.scaling = self.attention_head_size**-0.5 + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + kv_hidden_size = config.encoder_hidden_size + else: + kv_hidden_size = config.hidden_size + self.key = nn.Linear(kv_hidden_size, self.all_head_size) + self.value = nn.Linear(kv_hidden_size, self.all_head_size) + + self.position_embedding_type = getattr(config, + "position_embedding_type", + "absolute") + if self.position_embedding_type != "absolute": + raise NotImplementedError("Unsupported position_embedding_type: " + f"{self.position_embedding_type}") + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + x = x.view(*x.size()[:-1], self.num_attention_heads, + self.attention_head_size) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + ): + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_probs = torch.softmax(attention_scores * self.scaling, + dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + context_layer = context_layer.view(*context_layer.size()[:-2], + self.all_head_size) + + return context_layer + + +class Blip2QFormerSelfOutput(nn.Module): + + def __init__(self, config: Blip2QFormerConfig) -> None: + super().__init__() + + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + input_tensor: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerAttention(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + is_cross_attention: bool = False, + ) -> None: + super().__init__() + + self.attention = Blip2QFormerMultiHeadAttention( + config, + quant_config=quant_config, + cache_config=cache_config, + is_cross_attention=is_cross_attention, + ) + + self.output = Blip2QFormerSelfOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + ) -> Tuple[torch.Tensor]: + self_output = self.attention( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + attention_output = self.output(self_output, hidden_states) + + return attention_output + + +class Blip2QFormerIntermediate(nn.Module): + + def __init__(self, config: Blip2QFormerConfig) -> None: + super().__init__() + + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = get_act_fn(config.hidden_act) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class Blip2QFormerOutput(nn.Module): + + def __init__(self, config: Blip2QFormerConfig) -> None: + super().__init__() + + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + input_tensor: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerLayer(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + layer_idx: int, + ) -> None: + super().__init__() + + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = Blip2QFormerAttention(config, + quant_config=quant_config, + cache_config=cache_config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = Blip2QFormerAttention( + config, + quant_config=quant_config, + cache_config=cache_config, + is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + 
self.intermediate_query = Blip2QFormerIntermediate(config) + self.output_query = Blip2QFormerOutput(config) + + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + query_length: int, + ): + attention_output = self.attention(hidden_states) + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + query_attention_output = self.crossattention( + query_attention_output, + encoder_hidden_states=encoder_hidden_states, + ) + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], + dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + + return layer_output + + def feed_forward_chunk(self, + attention_output: torch.Tensor) -> torch.Tensor: + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query( + self, attention_output: torch.Tensor) -> torch.Tensor: + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class Blip2QFormerEncoder(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + ) -> None: + super().__init__() + + self.config = config + + self.layer = nn.ModuleList([ + Blip2QFormerLayer(config, + quant_config=quant_config, + cache_config=cache_config, + layer_idx=layer_idx) + for layer_idx in range(config.num_hidden_layers) + ]) + + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + query_length: int, + ) -> torch.Tensor: + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + + hidden_states = layer_module( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + query_length=query_length, + ) + + return hidden_states + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1025 +class Blip2QFormerModel(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + ) -> None: + super().__init__() + + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = Blip2QFormerEncoder(config, + quant_config=quant_config, + cache_config=cache_config) + + def forward( + self, + query_embeds: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + ) -> torch.Tensor: + query_length = query_embeds.shape[1] + + embedding_output = self.layernorm(query_embeds) + embedding_output = self.dropout(embedding_output) + + sequence_output = self.encoder( + embedding_output, + encoder_hidden_states=encoder_hidden_states, + query_length=query_length, + ) + + return sequence_output + + +class 
Blip2ImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: (batch_size, num_channels, height, width)""" + + +Blip2ImageInputs = Blip2ImagePixelInputs + +# We use this internally as placeholders since there is no image token +# defined on the HuggingFace repo +BLIP2_IMAGE_TOKEN = "" +BLIP2_IMAGE_TOKEN_ID = 50265 + + +def get_blip2_image_feature_size(hf_config: Blip2Config) -> int: + return hf_config.num_query_tokens + + +def get_max_blip2_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(Blip2Config) + vision_config = hf_config.vision_config + + if isinstance(vision_config, Blip2VisionConfig): + return get_max_blip_image_tokens(vision_config) + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +def dummy_data_for_blip2(ctx: InputContext, seq_len: int): + hf_config = ctx.get_hf_config(Blip2Config) + vision_config = hf_config.vision_config + + image_feature_size = get_blip2_image_feature_size(hf_config) + token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size + token_ids += [0] * (seq_len - image_feature_size) + seq_data = SequenceData(token_ids) + + if isinstance(vision_config, Blip2VisionConfig): + mm_data = dummy_image_for_blip(vision_config) + + return seq_data, mm_data + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +def input_processor_for_blip2(ctx: InputContext, llm_inputs: LLMInputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + hf_config = ctx.get_hf_config(Blip2Config) + image_feature_size = get_blip2_image_feature_size(hf_config) + + # The original model places image tokens at the front + # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514 + new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size + new_token_ids += llm_inputs["prompt_token_ids"] + + new_prompt = llm_inputs.get("prompt") + if new_prompt is not None: + new_prompt = BLIP2_IMAGE_TOKEN * image_feature_size + new_prompt + + return LLMInputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + + +@MULTIMODAL_REGISTRY.register_image_input_mapper() +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_blip2) +@INPUT_REGISTRY.register_input_processor(input_processor_for_blip2) +class Blip2ForConditionalGeneration(nn.Module, SupportsVision): + + def __init__(self, + config: Blip2Config, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + + super().__init__() + + self.config = config + self.multimodal_config = multimodal_config + + # TODO: Optionally initializes this for supporting embeddings. 
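The token-id expansion performed by `input_processor_for_blip2` above can be reproduced in isolation. The placeholder id 50265 matches `BLIP2_IMAGE_TOKEN_ID`; the feature size of 32 and the short prompt ids are assumed stand-ins for `hf_config.num_query_tokens` and real tokenizer output.

# Standalone sketch of the placeholder expansion done by input_processor_for_blip2.
BLIP2_IMAGE_TOKEN_ID = 50265
image_feature_size = 32                      # assumed value for num_query_tokens

prompt_token_ids = [2, 45641, 35]            # toy tokenizer output
new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size + prompt_token_ids

assert new_token_ids[:image_feature_size] == [BLIP2_IMAGE_TOKEN_ID] * image_feature_size
assert new_token_ids[image_feature_size:] == prompt_token_ids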
+ self.vision_model = BlipVisionModel(config.vision_config) + + self.query_tokens = nn.Parameter( + torch.zeros(1, config.num_query_tokens, + config.qformer_config.hidden_size)) + + self.qformer = Blip2QFormerModel(config.qformer_config, + cache_config=cache_config, + quant_config=quant_config) + + self.language_projection = nn.Linear( + config.qformer_config.hidden_size, + config.text_config.hidden_size, + bias=True, + ) + + self.quant_config = quant_config + + self.language_model = OPTModel(config.text_config, cache_config, + quant_config) + + self.unpadded_vocab_size = config.text_config.vocab_size + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size) + self.sampler = Sampler() + + def get_lm_head(self): + return self.language_model.decoder.embed_tokens + + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + actual_dims = tuple(data.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("batch_size", *map(str, expected_dims)) + raise ValueError( + f"The expected shape of pixel values is {expected_expr}. " + f"You supplied {tuple(data.shape)}.") + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Blip2ImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + + if pixel_values is None: + return None + + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return Blip2ImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + ) + + def _image_pixels_to_features(self, vision_model: BlipVisionModel, + pixel_values: torch.Tensor) -> torch.Tensor: + + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + image_features = vision_model(pixel_values) + + return image_features + + def _process_image_pixels(self, + inputs: Blip2ImagePixelInputs) -> torch.Tensor: + assert self.vision_model is not None + + pixel_values = inputs["data"] + + return self._image_pixels_to_features(self.vision_model, pixel_values) + + def _process_image_input(self, + image_input: Blip2ImageInputs) -> torch.Tensor: + assert self.vision_model is not None + image_features = self._process_image_pixels(image_input) + + query_tokens = self.query_tokens.expand(image_features.shape[0], -1, + -1) + query_output = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_features, + ) + + return self.language_projection(query_output) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> SamplerOutput: + """Run forward pass for BLIP-2. + + One key thing to understand is the `input_ids` already accounts for the + positions of the to-be-inserted image embeddings. + + Concretely, consider a text prompt: + `"Question: What's the content of the image? Answer:"`. + + Tokenizer outputs: + `[2, 45641, 35, 653, 18, 5, 1383, 9, 5, 2274, 116, 31652, 35]`. + + To reserve space in KV cache, we have to insert placeholder tokens + before they are inputted to the model, so the input processor prepends + dummy tokens (denoted as `50265`), resulting in: + `[50265, ..., 50265, 2, 45641, 35, ..., 31652, 35]`. 
+ + We insert 32 tokens since it corresponds to the number of query + embeddings outputted by the Q-Former and inputted to the language model. + + This way, the `positions` and `attn_metadata` are consistent + with the `input_ids`. + + Args: + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + pixel_values: The pixels in each input image. + + See also: + :class:`Blip2ImageInputs` + """ + image_input = self._parse_and_validate_image_input(**kwargs) + + if image_input is not None: + vision_embeddings = self._process_image_input(image_input) + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds, + vision_embeddings, + BLIP2_IMAGE_TOKEN_ID) + + input_ids = None + else: + inputs_embeds = None + + hidden_states = self.language_model(input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds) + + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.get_lm_head(), hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + # only doing this for language model part for now. + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + + for name, loaded_weight in weights: + if "lm_head.weight" in name: + continue + if "rotary_emb.inv_freq" in name: + continue + for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in name: + name = name.replace(key_to_modify, new_key) + use_default_weight_loading = False + if "vision" in name: + if self.vision_model is not None: + # We only do sharding for language model and + # not vision model for now. 
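The placeholder replacement described in the docstring above can be pictured with toy tensors; the shapes and values below are invented purely for illustration, and the actual merging in the model is done by `merge_vision_embeddings`.

import torch

BLIP2_IMAGE_TOKEN_ID = 50265
input_ids = torch.tensor([50265, 50265, 2, 45641])   # 2 placeholders + 2 text tokens
inputs_embeds = torch.zeros(4, 8)                     # (seq_len, hidden_size)
vision_embeddings = torch.ones(2, 8)                  # one row per placeholder

mask = input_ids == BLIP2_IMAGE_TOKEN_ID
inputs_embeds[mask] = vision_embeddings               # placeholders now carry image features
assert inputs_embeds[:2].eq(1).all()
assert inputs_embeds[2:].eq(0).all()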
+ use_default_weight_loading = True + else: + for (param_name, weight_name, + shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + use_default_weight_loading = True + if use_default_weight_loading: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index edc16710c0229..a05090cd46648 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -237,14 +237,19 @@ def __init__( for _ in range(config.num_hidden_layers) ]) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - inputs_embeds = self.embed_tokens(input_ids) + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings(input_ids) pos_embeds = self.embed_positions(positions) if self.project_in is not None: inputs_embeds, _ = self.project_in(inputs_embeds) @@ -272,14 +277,22 @@ def __init__( super().__init__() self.decoder = OPTDecoder(config, cache_config, quant_config) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.decoder.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - return self.decoder(input_ids, positions, kv_caches, attn_metadata) + return self.decoder(input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds) class OPTForCausalLM(nn.Module): diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 0d435bd644e29..5abd0ad61cdf9 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,8 +1,9 @@ import sys from abc import ABC, abstractmethod from collections import UserDict, defaultdict -from typing import (Any, Callable, Dict, List, Optional, Type, TypedDict, - TypeVar, Union, cast) +from typing import Any, Callable, Dict, List, Optional +from typing import Sequence as GenericSequence +from typing import Type, TypedDict, TypeVar, Union, cast import torch import torch.types @@ -15,13 +16,13 @@ logger = init_logger(__name__) -NestedTensors = Union[List[torch.Tensor], torch.Tensor] +NestedTensors = Union[GenericSequence[torch.Tensor], torch.Tensor] """ Use a list instead of a tensor if the dimensions of each element do not match. Currently only supports up to singly nested list of tensors. 
""" -BatchedTensors = Union[List[NestedTensors], NestedTensors] +BatchedTensors = Union[GenericSequence[NestedTensors], NestedTensors] """ If each input tensor in the batch has the same size, this is a single batched tensor; otherwise, this is a list of :class:`NestedTensors` with one element @@ -53,7 +54,7 @@ def try_concat( # may be list rather than tensors if isinstance(tensors[0], list): return [[t.to(device=device) for t in tensor[0]] - for tensor in tensors] + for tensor in cast(List[List[torch.Tensor]], tensors)] tensors_ = cast(List[torch.Tensor], tensors) From f954d0715c8b68e780aac4a4f3ffd1ab56bebfcd Mon Sep 17 00:00:00 2001 From: Chenggang Wu Date: Sat, 27 Jul 2024 09:24:46 -0700 Subject: [PATCH 069/167] [Docs] Add RunLLM chat widget (#6857) --- docs/source/_static/custom.js | 16 ++++++++++++++++ docs/source/conf.py | 2 ++ 2 files changed, 18 insertions(+) create mode 100644 docs/source/_static/custom.js diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js new file mode 100644 index 0000000000000..f475be71fc448 --- /dev/null +++ b/docs/source/_static/custom.js @@ -0,0 +1,16 @@ +document.addEventListener("DOMContentLoaded", function () { + var script = document.createElement("script"); + script.type = "module"; + script.id = "runllm-widget-script" + + script.src = "https://widget.runllm.com"; + + script.setAttribute("version", "stable"); + script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. + script.setAttribute("runllm-name", "vLLM"); + script.setAttribute("runllm-position", "BOTTOM_RIGHT"); + script.setAttribute("runllm-assistant-id", "207"); + + script.async = true; + document.head.appendChild(script); + }); \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index b867bfd89dc17..1093b30bca11d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -68,6 +68,8 @@ 'use_repository_button': True, 'use_edit_page_button': True, } +html_static_path = ["_static"] +html_js_files = ["custom.js"] # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE') From fad5576c58864a6c2cf528f67e60e03a949b3dac Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 27 Jul 2024 10:28:33 -0700 Subject: [PATCH 070/167] [TPU] Reduce compilation time & Upgrade PyTorch XLA version (#6856) --- Dockerfile.tpu | 2 +- docs/source/getting_started/tpu-installation.rst | 9 ++++++++- vllm/attention/backends/pallas.py | 1 - .../device_communicators/tpu_communicator.py | 3 ++- vllm/worker/tpu_model_runner.py | 15 +++++++++++++-- vllm/worker/tpu_worker.py | 1 - 6 files changed, 24 insertions(+), 7 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index be7dbe63cb237..4fc14d6bd186c 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20240713" +ARG NIGHTLY_DATE="20240726" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index 5e2f514a4a509..2e6c522422c22 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -56,7 +56,7 @@ First, install the dependencies: $ pip uninstall torch torch-xla -y $ # Install PyTorch and PyTorch XLA. 
- $ export DATE="+20240713" + $ export DATE="+20240726" $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-nightly${DATE}-cp310-cp310-linux_x86_64.whl $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly${DATE}-cp310-cp310-linux_x86_64.whl @@ -75,6 +75,13 @@ Next, build vLLM from source. This will only take a few seconds: $ VLLM_TARGET_DEVICE="tpu" python setup.py develop +.. note:: + + Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape. + The compilation time may take 20~30 minutes in the first run. + However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). + + .. tip:: If you encounter the following error: diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index c53a2f91b89d7..2269ac2606e89 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -3,7 +3,6 @@ import torch import torch_xla.experimental.custom_kernel # Required to register custom ops. -import torch_xla.experimental.dynamo_set_buffer_donor from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py index 69a9a516f3ebe..16525887cf4eb 100644 --- a/vllm/distributed/device_communicators/tpu_communicator.py +++ b/vllm/distributed/device_communicators/tpu_communicator.py @@ -6,6 +6,7 @@ if current_platform.is_tpu(): import torch_xla.core.xla_model as xm + import torch_xla.runtime as xr from torch_xla._internal import pjrt @@ -20,7 +21,7 @@ def __init__(self, group: ProcessGroup): local_rank = dist.get_rank(group) world_size = dist.get_world_size(group) pjrt.initialize_multiprocess(local_rank, world_size) - xm._init_world_size_ordinal() + xr._init_world_size_ordinal() def all_reduce(self, x: torch.Tensor) -> torch.Tensor: return xm.all_reduce(xm.REDUCE_SUM, x) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index e5bb101fc7df4..1692094af8c41 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -7,6 +7,7 @@ import torch import torch.nn as nn import torch_xla.core.xla_model as xm +import torch_xla.runtime as xr from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, @@ -127,7 +128,7 @@ def load_model(self) -> None: # determine the order of concatenating the output tensors. # As a workaround, we use the xm's rank assignment only when loading # the embedding weights. - xm_tp_rank = xm.get_ordinal() + xm_tp_rank = xr.global_ordinal() with patch( "vllm.model_executor.layers.vocab_parallel_embedding." "get_tensor_model_parallel_rank", @@ -146,7 +147,17 @@ def load_model(self) -> None: xm.wait_device_ops() model = ModelWrapper(model) - self.model = torch.compile(model, backend="openxla", fullgraph=True) + # NOTE(woosuk): There are two stages of compilation: torch.compile and + # XLA compilation. Setting dynamic=True can reduce the torch.compile + # overhead by reusing the FX graph for different shapes. + # However, the XLA graph will still require static shapes and needs to + # be re-compiled for every different shapes. 
This overhead is inevitable + # in the first run, but can be skipped afterwards as we cache the XLA + # graphs in the disk (VLLM_XLA_CACHE_PATH). + self.model = torch.compile(model, + backend="openxla", + fullgraph=True, + dynamic=True) def _dummy_run( self, diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index c88aba7ae08cd..17fa5c35457c2 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -3,7 +3,6 @@ import torch import torch_xla.core.xla_model as xm -import torch_xla.experimental.dynamo_set_buffer_donor # noqa: F401 import torch_xla.runtime as xr import vllm.envs as envs From 75acdaa4b616c2e95c55a47d3158ceec9c72c503 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Sat, 27 Jul 2024 17:52:33 -0400 Subject: [PATCH 071/167] [Kernel] Increase precision of GPTQ/AWQ Marlin kernel (#6795) --- benchmarks/kernels/benchmark_marlin.py | 23 ++- csrc/ops.h | 3 +- csrc/quantization/gptq_marlin/gptq_marlin.cu | 150 ++++++++++++++---- tests/kernels/test_marlin_gemm.py | 13 +- vllm/_custom_ops.py | 6 +- .../layers/quantization/utils/marlin_utils.py | 17 +- 6 files changed, 168 insertions(+), 44 deletions(-) diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 3da4cecd7eeff..684985b81f690 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -10,7 +10,7 @@ GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, - GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS) + MARLIN_SUPPORTED_GROUP_SIZES, MARLIN_SUPPORTED_NUM_BITS) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( MarlinWorkspace, marlin_quantize) from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( @@ -56,6 +56,8 @@ def bench_run(results: List[benchmark.Measurement], model: str, (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = marlin_24_quantize(b, num_bits, group_size) + marlin_zp = torch.empty(0, dtype=torch.int, device=b.device) + # GPTQ quant (w_ref, q_w, s, g_idx, rand_perm) = quantize_weights(b, num_bits, group_size, act_order) @@ -87,6 +89,7 @@ def bench_run(results: List[benchmark.Measurement], model: str, "marlin_w_ref": marlin_w_ref, "marlin_q_w": marlin_q_w, "marlin_s": marlin_s, + "marlin_zp": marlin_zp, "marlin_g_idx": marlin_g_idx, "marlin_sort_indices": marlin_sort_indices, "marlin_rand_perm": marlin_rand_perm, @@ -125,11 +128,21 @@ def bench_run(results: List[benchmark.Measurement], model: str, results.append( benchmark.Timer( stmt= - "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)", # noqa: E501 + "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full, False, False)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_gemm_fp16", + ).blocked_autorange(min_run_time=min_run_time)) + + results.append( + benchmark.Timer( + stmt= + "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full, False, True)", # noqa: E501 globals=globals, 
label=label, sub_label=sub_label, - description="gptq_marlin_gemm", + description="gptq_marlin_gemm_fp32", ).blocked_autorange(min_run_time=min_run_time)) if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS @@ -183,12 +196,12 @@ def main(args): ) > 0 and is_k_full not in args.limit_k_full: continue - for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS: + for num_bits in MARLIN_SUPPORTED_NUM_BITS: if len(args.limit_num_bits ) > 0 and num_bits not in args.limit_num_bits: continue - for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES: + for group_size in MARLIN_SUPPORTED_GROUP_SIZES: if len( args.limit_group_size ) > 0 and group_size not in args.limit_group_size: diff --git a/csrc/ops.h b/csrc/ops.h index 9ef1fcb465bf3..f075850248d1c 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -93,7 +93,8 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& g_idx, torch::Tensor& perm, torch::Tensor& workspace, int64_t num_bits, int64_t size_m, int64_t size_n, int64_t size_k, - bool is_k_full, bool has_zp); + bool is_k_full, bool has_zp, + bool use_fp32_reduce); torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 122c5c16b58ce..36ae2bfafa7c2 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -59,14 +59,16 @@ __global__ void Marlin( const int4* __restrict__ A, // fp16 input matrix of shape mxk const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape // (k/groupsize)xn const int* __restrict__ g_idx, // int32 group indices of shape k - int num_groups, // number of scale groups per output channel - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks, // extra global storage for barrier synchronization + bool use_fp32_reduce // whether to use fp32 global reduce ) {} } // namespace gptq_marlin @@ -532,16 +534,18 @@ __global__ void Marlin( const int4* __restrict__ A, // fp16 input matrix of shape mxk const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape // (k/groupsize)xn const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape // (k/groupsize)x(n/pack_factor) const int* __restrict__ g_idx, // int32 group indices of shape k - int num_groups, // number of scale groups per output channel - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks, // extra global storage for barrier synchronization + bool use_fp32_reduce // 
whether to use fp32 global reduce ) { // Each threadblock processes one "stripe" of the B matrix with (roughly) the // same size, which might involve multiple column "slices" (of width 16 * @@ -595,6 +599,8 @@ __global__ void Marlin( int slice_idx; // index of threadblock in current slice; numbered bottom to // top + int par_id = 0; + // We can easily implement parallel problem execution by just remapping // indices and advancing global pointers if (slice_col_par >= n_tiles) { @@ -602,6 +608,7 @@ __global__ void Marlin( C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; locks += (slice_col_par / n_tiles) * n_tiles; slice_col = slice_col_par % n_tiles; + par_id = slice_col_par / n_tiles; } // Compute all information about the current slice which is required for @@ -632,6 +639,7 @@ __global__ void Marlin( C += 16 * thread_m_blocks * prob_n / 8; locks += n_tiles; slice_col = 0; + par_id++; } }; init_slice(); @@ -1321,7 +1329,7 @@ __global__ void Marlin( // finally have to globally reduce over the results. As the striped // partitioning minimizes the number of such reductions and our outputs are // usually rather small, we perform this reduction serially in L2 cache. - auto global_reduce = [&](bool first = false, bool last = false) { + auto global_reduce_fp16 = [&](bool first = false, bool last = false) { // We are very careful here to reduce directly in the output buffer to // maximize L2 cache utilization in this step. To do this, we write out // results in FP16 (but still reduce with FP32 compute). @@ -1382,6 +1390,53 @@ __global__ void Marlin( } }; + // Globally reduce over threadblocks that compute the same column block. + // We use a tmp C buffer to reduce in full fp32 precision. + auto global_reduce_fp32 = [&](bool first = false, bool last = false) { + constexpr int tb_m = thread_m_blocks * 16; + constexpr int tb_n = thread_n_blocks * 16; + + constexpr int c_size = tb_m * tb_n * sizeof(float) / 16; + + constexpr int active_threads = 32 * thread_n_blocks / 4; + bool is_th_active = threadIdx.x < active_threads; + + int par_offset = c_size * n_tiles * par_id; + int slice_offset = c_size * slice_col; + + constexpr int num_floats = thread_m_blocks * 4 * 2 * 4; + constexpr int th_size = num_floats * sizeof(float) / 16; + + int c_cur_offset = par_offset + slice_offset; + + if (!is_th_active) { + return; + } + + if (!first) { + float* frag_c_ptr = reinterpret_cast(&frag_c); + #pragma unroll + for (int k = 0; k < th_size; k++) { + sh[threadIdx.x] = + C_tmp[c_cur_offset + active_threads * k + threadIdx.x]; + + float* sh_c_ptr = reinterpret_cast(&sh[threadIdx.x]); + #pragma unroll + for (int f = 0; f < 4; f++) { + frag_c_ptr[k * 4 + f] += sh_c_ptr[f]; + } + } + } + + if (!last) { + int4* frag_c_ptr = reinterpret_cast(&frag_c); + #pragma unroll + for (int k = 0; k < th_size; k++) { + C_tmp[c_cur_offset + active_threads * k + threadIdx.x] = frag_c_ptr[k]; + } + } + }; + // Write out the reduce final result in the correct layout. We only actually // reshuffle matrix fragments in this step, the reduction above is performed // in fragment layout. 
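A toy numerical motivation for the fp32 reduce path above (plain PyTorch on the CPU, not a model of the CUDA kernel): a running sum kept in fp16 stops growing once the accumulator's spacing exceeds the addend, whereas accumulating in fp32 and casting once at the end stays close to the true value.

import torch

addend = torch.tensor(0.4, dtype=torch.float16)
acc_fp16 = torch.tensor(0.0, dtype=torch.float16)
acc_fp32 = torch.tensor(0.0, dtype=torch.float32)
for _ in range(5000):
    acc_fp16 = acc_fp16 + addend          # result rounded to fp16 after every step
    acc_fp32 = acc_fp32 + addend.float()  # accumulated in fp32

print(acc_fp16.item())                    # stalls at 1024.0
print(acc_fp32.to(torch.float16).item())  # ~2000 after a single final cast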
@@ -1606,7 +1661,11 @@ __global__ void Marlin( if (slice_count > 1) { // only globally reduce if there is more than one // block in a slice barrier_acquire(&locks[slice_col], slice_idx); - global_reduce(slice_idx == 0, last); + if (use_fp32_reduce) { + global_reduce_fp32(slice_idx == 0, last); + } else { + global_reduce_fp16(slice_idx == 0, last); + } barrier_release(&locks[slice_col], last); } if (last) // only the last block in a slice actually writes the result @@ -1661,8 +1720,8 @@ __global__ void Marlin( THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER, \ HAS_ZP, GROUP_BLOCKS> \ <<>>( \ - A_ptr, B_ptr, C_ptr, s_ptr, zp_ptr, g_idx_ptr, num_groups, \ - prob_m, prob_n, prob_k, locks); \ + A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \ + num_groups, prob_m, prob_n, prob_k, locks, use_fp32_reduce); \ } typedef struct { @@ -1801,6 +1860,27 @@ bool is_valid_config(thread_config_t const& th_config, int max_m_blocks, return true; } +int determine_reduce_max_m(int prob_m, int max_par) { + constexpr int tile_m_size = 16; + + if (prob_m <= tile_m_size) { + return tile_m_size; + + } else if (prob_m <= tile_m_size * 2) { + return tile_m_size * 2; + + } else if (prob_m <= tile_m_size * 3) { + return tile_m_size * 3; + + } else if (prob_m <= tile_m_size * 4) { + return tile_m_size * 4; + + } else { + int cur_par = min(div_ceil(prob_m, tile_m_size * 4), max_par); + return tile_m_size * 4 * cur_par; + } +} + exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, int num_bits, int group_size, bool has_act_order, bool is_k_full, @@ -1880,13 +1960,13 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) template -void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s, void* zp, - void* g_idx, void* perm, void* a_tmp, int prob_m, - int prob_n, int prob_k, void* workspace, int num_bits, - bool has_act_order, bool is_k_full, bool has_zp, - int num_groups, int group_size, int dev, +void marlin_mm_f16i4(const void* A, const void* B, void* C, void* C_tmp, + void* s, void* zp, void* g_idx, void* perm, void* a_tmp, + int prob_m, int prob_n, int prob_k, void* workspace, + int num_bits, bool has_act_order, bool is_k_full, + bool has_zp, int num_groups, int group_size, int dev, cudaStream_t stream, int thread_k, int thread_n, int sms, - int max_par) { + int max_par, bool use_fp32_reduce) { TORCH_CHECK(num_bits == 4 || num_bits == 8, "num_bits must be 4 or 8. Got = ", num_bits); TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, @@ -1970,6 +2050,7 @@ void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s, void* zp, const int4* A_ptr = (const int4*)A; const int4* B_ptr = (const int4*)B; int4* C_ptr = (int4*)C; + int4* C_tmp_ptr = (int4*)C_tmp; const int4* s_ptr = (const int4*)s; const int4* zp_ptr = (const int4*)zp; const int* g_idx_ptr = (const int*)g_idx; @@ -2049,7 +2130,8 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& g_idx, torch::Tensor& perm, torch::Tensor& workspace, int64_t num_bits, int64_t size_m, int64_t size_n, int64_t size_k, - bool is_k_full, bool has_zp) { + bool is_k_full, bool has_zp, + bool use_fp32_reduce) { // Verify num_bits TORCH_CHECK(num_bits == 4 || num_bits == 8, "num_bits must be 4 or 8. 
Got = ", num_bits); @@ -2099,6 +2181,17 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor c = torch::empty({size_m, size_n}, options); torch::Tensor a_tmp = torch::empty({size_m, size_k}, options); + // Alloc C tmp buffer that is going to be used for the global reduce + int reduce_max_m = marlin::determine_reduce_max_m(size_m, marlin::max_par); + int reduce_n = size_n; + auto options_fp32 = + torch::TensorOptions().dtype(at::kFloat).device(a.device()); + if (!use_fp32_reduce) { + reduce_max_m = 0; + reduce_n = 0; + } + torch::Tensor c_tmp = torch::empty({reduce_max_m, reduce_n}, options_fp32); + // thread_k: `k` size of a thread_tile in `weights` (can usually be left as // auto -1) int thread_k = -1; @@ -2171,20 +2264,21 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, if (a.scalar_type() == at::ScalarType::Half) { marlin::marlin_mm_f16i4( a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), - b_scales.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), - perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, + c_tmp.data_ptr(), b_scales.data_ptr(), + b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), + a_tmp.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), num_bits, has_act_order, is_k_full, has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, marlin::max_par); + thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce); } else if (a.scalar_type() == at::ScalarType::BFloat16) { marlin::marlin_mm_f16i4( a.data_ptr(), b_q_weight.data_ptr(), - c.data_ptr(), b_scales.data_ptr(), - b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), - a_tmp.data_ptr(), size_m, size_n, size_k, + c.data_ptr(), c_tmp.data_ptr(), + b_scales.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), + perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), num_bits, has_act_order, is_k_full, has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, marlin::max_par); + thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce); } else { TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16"); } diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 42087fdcce959..bd35ef2eb2552 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -27,6 +27,7 @@ ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] +USE_FP32_REDUCE_OPTS = [False, True] MARLIN_K_CHUNKS = [128] MARLIN_N_CHUNKS = [64, 128, 256] @@ -175,6 +176,7 @@ def test_awq_marlin_repack(k_chunk, n_chunk, num_bits, group_size, @pytest.mark.parametrize("mnk_factors", MNK_FACTORS) @pytest.mark.parametrize("act_order", ACT_ORDER_OPTS) @pytest.mark.parametrize("is_k_full", K_FULL_OPTS) +@pytest.mark.parametrize("use_fp32_reduce", USE_FP32_REDUCE_OPTS) def test_gptq_marlin_gemm( k_chunk, n_chunk, @@ -183,6 +185,7 @@ def test_gptq_marlin_gemm( mnk_factors, act_order, is_k_full, + use_fp32_reduce, ): m_factor, n_factor, k_factor = mnk_factors @@ -222,8 +225,9 @@ def test_gptq_marlin_gemm( a_input.shape[0], b_weight.shape[1], a_input.shape[1], - is_k_full, + is_k_full=is_k_full, has_zp=False, + use_fp32_reduce=use_fp32_reduce, ) output_ref = torch.matmul(a_input, w_ref) @@ -365,12 +369,14 @@ def test_fp8_marlin_gemm( @pytest.mark.parametrize("num_bits", MARLIN_SUPPORTED_NUM_BITS) @pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES) @pytest.mark.parametrize("mnk_factors", 
MNK_FACTORS) +@pytest.mark.parametrize("use_fp32_reduce", USE_FP32_REDUCE_OPTS) def test_awq_marlin_gemm( k_chunk, n_chunk, num_bits, group_size, mnk_factors, + use_fp32_reduce, ): m_factor, n_factor, k_factor = mnk_factors @@ -407,8 +413,9 @@ def test_awq_marlin_gemm( a_input.shape[0], b_weight.shape[1], a_input.shape[1], - is_k_full, - has_zp, + is_k_full=is_k_full, + has_zp=has_zp, + use_fp32_reduce=use_fp32_reduce, ) output_ref = torch.matmul(a_input, w_ref) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 0186594656cc1..ad9f01be6ddd4 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -286,12 +286,12 @@ def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_scales: torch.Tensor, b_zeros: torch.Tensor, g_idx: torch.Tensor, perm: torch.Tensor, workspace: torch.Tensor, num_bits: int, size_m: int, - size_n: int, size_k: int, is_k_full: bool, - has_zp: bool) -> torch.Tensor: + size_n: int, size_k: int, is_k_full: bool, has_zp: bool, + use_fp32_reduce: bool) -> torch.Tensor: return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, b_zeros, g_idx, perm, workspace, num_bits, size_m, size_n, size_k, is_k_full, - has_zp) + has_zp, use_fp32_reduce) # fp8 marlin diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 25a7cd7bde653..b789ca20cadb3 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -16,6 +16,11 @@ MARLIN_SUPPORTED_NUM_BITS = [4, 8] MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128] +# In case there is a performance issue with Marlin, the variable below can be +# changed to False, which allows Marlin to perform global reductions in fp16 +# precision (instead of fp32), and therefore, save on some memory movements. 
+USE_FP32_REDUCE_DEFAULT = True + def _check_marlin_supported(num_bits: int, group_size: int, is_sym: bool, min_capability: Optional[int], @@ -244,7 +249,8 @@ def apply_gptq_marlin_linear( output_size_per_partition: int, input_size_per_partition: int, is_k_full: bool, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + bias: Optional[torch.Tensor] = None, + use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor: reshaped_x = input.reshape(-1, input.shape[-1]) out_shape = input.shape[:-1] + (output_size_per_partition, ) @@ -260,7 +266,8 @@ def apply_gptq_marlin_linear( size_n=output_size_per_partition, size_k=input_size_per_partition, is_k_full=is_k_full, - has_zp=False) + has_zp=False, + use_fp32_reduce=use_fp32_reduce) if bias is not None: output.add_(bias) # In-place add @@ -279,7 +286,8 @@ def apply_awq_marlin_linear( num_bits: int, output_size_per_partition: int, input_size_per_partition: int, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + bias: Optional[torch.Tensor] = None, + use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor: reshaped_x = input.reshape(-1, input.shape[-1]) out_shape = input.shape[:-1] + (output_size_per_partition, ) @@ -295,7 +303,8 @@ def apply_awq_marlin_linear( size_n=output_size_per_partition, size_k=input_size_per_partition, is_k_full=True, - has_zp=True) + has_zp=True, + use_fp32_reduce=use_fp32_reduce) if bias is not None: output.add_(bias) # In-place add From b1366a953498fde9c5e7ab91915367ebc69008b2 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 27 Jul 2024 18:05:17 -0400 Subject: [PATCH 072/167] Add Nemotron to PP_SUPPORTED_MODELS (#6863) --- vllm/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/config.py b/vllm/config.py index 92fde449b43fd..e7b54e04b00d5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -39,6 +39,7 @@ "Phi3ForCausalLM", "GPT2LMHeadModel", "MixtralForCausalLM", + "NemotronForCausalLM", ] From 3eeb148f467e3619e8890b1a5ebe86a173f91bc9 Mon Sep 17 00:00:00 2001 From: Elsa Granger <6374697+zeyugao@users.noreply.github.com> Date: Sun, 28 Jul 2024 23:13:49 +0800 Subject: [PATCH 073/167] [Misc] Pass cutlass_fp8_supported correctly in fbgemm_fp8 (#6871) --- .../layers/quantization/fbgemm_fp8.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 5e8d1f1947421..e7c3859967c71 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -9,6 +9,7 @@ UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.fp8 import cutlass_fp8_supported from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -72,6 +73,7 @@ class FBGEMMFp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: FBGEMMFp8Config): self.quant_config = quant_config + self.cutlass_fp8_supported = cutlass_fp8_supported() def create_weights( self, @@ -139,11 +141,12 @@ def apply(self, size_k=layer.input_size_per_partition, bias=bias) - return apply_fp8_linear(input=x, - weight=layer.weight, - weight_scale=layer.weight_scale, - input_scale=None, - input_scale_ub=layer.input_scale_ub, - bias=bias, - cutlass_fp8_supported=True, - 
use_per_token_if_dynamic=True) + return apply_fp8_linear( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=None, + input_scale_ub=layer.input_scale_ub, + bias=bias, + cutlass_fp8_supported=self.cutlass_fp8_supported, + use_per_token_if_dynamic=True) From 7cbd9ec7a9bfd4952ad522355b6bbb8e82b54fc9 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 29 Jul 2024 18:16:30 +0800 Subject: [PATCH 074/167] [Model] Initialize support for InternVL2 series models (#6514) Co-authored-by: Roger Wang --- docs/source/models/supported_models.rst | 4 + examples/offline_inference_vision_language.py | 15 + examples/openai_vision_api_client.py | 2 + requirements-test.txt | 1 + tests/models/test_internvl.py | 201 ++++++++ vllm/entrypoints/chat_utils.py | 2 +- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/intern_vit.py | 270 ++++++++++ vllm/model_executor/models/internlm2.py | 10 +- vllm/model_executor/models/internvl.py | 471 ++++++++++++++++++ vllm/model_executor/models/qwen2.py | 10 +- vllm/transformers_utils/config.py | 8 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/internvl.py | 51 ++ 14 files changed, 1042 insertions(+), 6 deletions(-) create mode 100644 tests/models/test_internvl.py create mode 100644 vllm/model_executor/models/intern_vit.py create mode 100644 vllm/model_executor/models/internvl.py create mode 100644 vllm/transformers_utils/configs/internvl.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 83c1b9c8bce86..4fe33e5ab5d80 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -200,6 +200,10 @@ Vision Language Models - Fuyu - :code:`adept/fuyu-8b` etc. - + * - :code:`InternVLChatModel` + - InternVL2 + - :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc. + - * - :code:`LlavaForConditionalGeneration` - LLaVA-1.5 - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc. 
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 04ba1a96314c9..846246a2062a6 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -106,6 +106,20 @@ def run_minicpmv(question): return llm, prompt +# InternVL +def run_internvl(question): + # Generally, InternVL can use chatml template for conversation + TEMPLATE = "<|im_start|>User\n{prompt}<|im_end|>\n<|im_start|>Assistant\n" + prompt = f"\n{question}\n" + prompt = TEMPLATE.format(prompt=prompt) + llm = LLM( + model="OpenGVLab/InternVL2-4B", + trust_remote_code=True, + max_num_seqs=5, + ) + return llm, prompt + + # BLIP-2 def run_blip2(question): @@ -125,6 +139,7 @@ def run_blip2(question): "chameleon": run_chameleon, "minicpmv": run_minicpmv, "blip-2": run_blip2, + "internvl_chat": run_internvl, } diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py index 2082c378e267c..be90394511f89 100644 --- a/examples/openai_vision_api_client.py +++ b/examples/openai_vision_api_client.py @@ -42,6 +42,7 @@ ], }], model=model, + max_tokens=64, ) result = chat_completion_from_url.choices[0].message.content @@ -78,6 +79,7 @@ def encode_image_base64_from_url(image_url: str) -> str: ], }], model=model, + max_tokens=64, ) result = chat_completion_from_base64.choices[0].message.content diff --git a/requirements-test.txt b/requirements-test.txt index a7604d2e1015e..9b88fcce3e842 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -16,6 +16,7 @@ ray sentence-transformers # required for embedding sparseml==1.8.0 # required for compressed-tensors compressed-tensors==0.4.0 # required for compressed-tensors +timm # required for internvl test # Benchmarking aiohttp diff --git a/tests/models/test_internvl.py b/tests/models/test_internvl.py new file mode 100644 index 0000000000000..66cb8dda248db --- /dev/null +++ b/tests/models/test_internvl.py @@ -0,0 +1,201 @@ +import types +from typing import List, Optional, Type + +import pytest +import torch +from huggingface_hub import snapshot_download +from PIL.Image import Image + +from vllm.model_executor.models.internvl import (IMG_CONTEXT, IMG_END, + IMG_START, + image_to_pixel_values) +from vllm.multimodal.utils import rescale_image_size +from vllm.utils import is_cpu + +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from .utils import check_logprobs_close + +pytestmark = pytest.mark.vlm + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "<|im_start|>User\n\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 + "cherry_blossom": + "<|im_start|>User\n\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 +}) + +# we use snapshot_download to prevent conflicts between +# dynamic_module and trust_remote_code for hf_runner +models = [ + snapshot_download("OpenGVLab/InternVL2-1B"), + snapshot_download("OpenGVLab/InternVL2-2B"), + # snapshot_download("OpenGVLab/InternVL2-4B"), # broken +] + + +class InternVLProcessor: + """A simple processor for InternVL2 HF model which misses a processor.""" + + def __init__(self, hf_runner: HfRunner): + self.num_image_token = hf_runner.model.num_image_token + self.tokenizer = hf_runner.tokenizer + self.dtype = hf_runner.model.dtype + + def __call__(self, text: str, images: Image, **kwargs): + pixel_values = image_to_pixel_values(images).to(self.dtype) + num_patches_list = [pixel_values.shape[0]] + for num_patches in 
num_patches_list: + context_tokens = IMG_CONTEXT * self.num_image_token * num_patches + image_tokens = IMG_START + context_tokens + IMG_END + text = text.replace('', image_tokens, 1) + prompt = self.tokenizer(text, return_tensors="pt") + prompt.update({"pixel_values": pixel_values}) + return prompt + + +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py +def generate( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.FloatTensor, + attention_mask: Optional[torch.LongTensor] = None, + **generate_kwargs, +) -> torch.LongTensor: + """Generate method for InternVL2 model without fixed use_cache.""" + assert self.img_context_token_id is not None + vit_embeds = self.extract_feature(pixel_values) + input_embeds = self.language_model.get_input_embeddings()(input_ids) + B, N, C = input_embeds.shape + input_embeds = input_embeds.reshape(B * N, C) + + input_ids = input_ids.reshape(B * N) + selected = (input_ids == self.img_context_token_id) + assert selected.sum() != 0 + input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device) + + input_embeds = input_embeds.reshape(B, N, C) + + outputs = self.language_model.generate( + inputs_embeds=input_embeds, + attention_mask=attention_mask, + **generate_kwargs, + ) + + return outputs + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + model: str, + *, + size_factors: List[float], + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalDataDict objects + and corresponding vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). 
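The ordering note above reflects a general PyTorch constraint rather than anything specific to this test: once the parent process has initialized CUDA, fork-started subprocesses cannot re-initialize it, so process pools must fall back to "spawn". A minimal sketch of that failure mode, with no vLLM involved, is:

import multiprocessing as mp

import torch


def worker() -> None:
    # Raises "Cannot re-initialize CUDA in forked subprocess" when the parent
    # already touched CUDA and this process was started with "fork".
    if torch.cuda.is_available():
        torch.zeros(1, device="cuda")


if __name__ == "__main__":
    if torch.cuda.is_available():
        torch.zeros(1, device="cuda")      # parent initializes CUDA first
    ctx = mp.get_context("spawn")          # "fork" here would hit the error above
    p = ctx.Process(target=worker)
    p.start()
    p.join()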
+ + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + max_model_len=4096, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + with hf_runner(model, dtype=dtype) as hf_model: + img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids( + "") + hf_model.model.img_context_token_id = img_context_token_id + hf_model.processor = InternVLProcessor(hf_model) + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.language_model.get_output_embeddings() + hf_model.model.generate = types.MethodType(generate, hf_model.model) + eos_token_id = hf_model.tokenizer.eos_token_id + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=hf_images, + eos_token_id=eos_token_id) + for prompts, hf_images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + # TODO: Check whether using original CLIPVisionModel can improve + # consistency against HF + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +target_dtype = "half" +if is_cpu(): + target_dtype = "bfloat16" + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +@torch.inference_mode() +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: + run_test( + hf_runner, + vllm_runner, + image_assets, + model, + size_factors=size_factors, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 1f6d77b828459..fbb7f70b55e16 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -107,7 +107,7 @@ def _image_token_str(model_config: ModelConfig, return None if model_type.startswith("llava"): return tokenizer.decode(model_config.hf_config.image_token_index) - if model_type == "chameleon": + if model_type in ("chameleon", "internvl_chat"): return "" raise TypeError(f"Unknown model type: {model_type}") diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index fe04c6db5fbc2..94c3cea98be7b 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -37,6 +37,7 @@ "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"), "InternLMForCausalLM": ("llama", "LlamaForCausalLM"), "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), + "InternVLChatModel": ("internvl", "InternVLChatModel"), "JAISLMHeadModel": ("jais", "JAISLMHeadModel"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), "LlavaForConditionalGeneration": diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py new file mode 100644 index 0000000000000..86d0930d80126 --- /dev/null +++ 
b/vllm/model_executor/models/intern_vit.py @@ -0,0 +1,270 @@ +# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2023 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import PretrainedConfig + +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig + +NORM2FN = { + 'rms_norm': RMSNorm, + 'layer_norm': nn.LayerNorm, +} + + +class InternVisionEmbeddings(nn.Module): + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + self.patch_embedding = nn.Conv2d(in_channels=3, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size) + + self.num_patches = (self.image_size // self.patch_size)**2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter( + torch.randn(1, self.num_positions, self.embed_dim)) + + def _get_pos_embed(self, pos_embed, H, W): + target_dtype = pos_embed.dtype + pos_embed = pos_embed.float().reshape( + 1, self.image_size // self.patch_size, + self.image_size // self.patch_size, -1).permute(0, 3, 1, 2) + pos_embed = F.interpolate(pos_embed, + size=(H, W), + mode='bicubic', + align_corners=False) + pos_embed = pos_embed.reshape(1, -1, H * W).permute(0, 2, + 1).to(target_dtype) + return pos_embed + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to( + target_dtype)) # shape = [*, channel, width, height] + batch_size, _, height, width = patch_embeds.shape + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, + -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + position_embedding = torch.cat([ + self.position_embedding[:, :1, :], + self._get_pos_embed(self.position_embedding[:, 1:, :], height, + width) + ], + dim=1) + embeddings = embeddings + position_embedding.to(target_dtype) + return embeddings + + +class InternAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f'embed_dim must be divisible by num_heads ' + f'(got `embed_dim`: {self.embed_dim} and `num_heads`:' + f' {self.num_heads}).') + + self.scale = self.head_dim**-0.5 + self.qkv = nn.Linear(self.embed_dim, + 3 * self.embed_dim, + bias=config.qkv_bias) + + self.qk_normalization = config.qk_normalization + + if self.qk_normalization: + self.q_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + self.k_norm = 
RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + + self.proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) + + if self.qk_normalization: + B_, H_, N_, D_ = q.shape + q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view( + B_, N_, H_, D_).transpose(1, 2) + k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view( + B_, N_, H_, D_).transpose(1, 2) + + x = F.scaled_dot_product_attention(q, k, v, scale=self.scale) + x = x.transpose(1, 2).reshape(B, N, C) + + x = self.proj(x) + return x + + +class InternMLP(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config) + self.fc2 = RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + + return hidden_states + + +class InternVisionEncoderLayer(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.embed_dim = config.hidden_size + self.intermediate_size = config.intermediate_size + self.norm_type = config.norm_type + + self.attn = InternAttention(config) + self.mlp = InternMLP(config, quant_config=quant_config) + self.norm1 = NORM2FN[self.norm_type](self.embed_dim, + eps=config.layer_norm_eps) + self.norm2 = NORM2FN[self.norm_type](self.embed_dim, + eps=config.layer_norm_eps) + + self.ls1 = nn.Parameter(config.initializer_factor * + torch.ones(self.embed_dim)) + self.ls2 = nn.Parameter(config.initializer_factor * + torch.ones(self.embed_dim)) + + def forward( + self, + hidden_states: torch.Tensor, + ): + hidden_states = hidden_states + self.attn( + self.norm1(hidden_states)) * self.ls1 + + hidden_states = hidden_states + self.mlp( + self.norm2(hidden_states)) * self.ls2 + + return hidden_states + + +class InternVisionEncoder(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + self.config = config + + if num_hidden_layers_override is None: + num_hidden_layers = config.num_hidden_layers + else: + num_hidden_layers = num_hidden_layers_override + self.layers = nn.ModuleList([ + InternVisionEncoderLayer(config=config, quant_config=quant_config) + for _ in range(num_hidden_layers) + ]) + + def forward(self, inputs_embeds: torch.Tensor): + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer(hidden_states) + + return hidden_states + + +class InternVisionModel(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + self.config = config + + self.embeddings = InternVisionEmbeddings(config) + self.encoder = InternVisionEncoder( + config=config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_override) + + def 
resize_pos_embeddings(self, old_size, new_size, patch_size): + pos_emb = self.embeddings.position_embedding + _, num_positions, embed_dim = pos_emb.shape + cls_emb = pos_emb[:, :1, :] + pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, + old_size // patch_size, + -1).permute(0, 3, 1, 2) + pos_emb = F.interpolate(pos_emb.float(), + size=new_size // patch_size, + mode='bicubic', + align_corners=False) + pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, + -1).permute(0, 2, 1) + pos_emb = torch.cat([cls_emb, pos_emb], dim=1) + self.embeddings.position_embedding = nn.Parameter(pos_emb) + self.embeddings.image_size = new_size + + def get_input_embeddings(self): + return self.embeddings + + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + pixel_embeds: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + if pixel_values is None and pixel_embeds is None: + raise ValueError( + 'You have to specify pixel_values or pixel_embeds') + + if pixel_embeds is not None: + hidden_states = pixel_embeds + elif pixel_values is not None: + if pixel_values.ndim == 4: + hidden_states = self.embeddings(pixel_values) + else: + raise ValueError( + f'wrong pixel_values size: {pixel_values.shape}') + + encoder_outputs = self.encoder(inputs_embeds=hidden_states) + + return encoder_outputs diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 22132f40fc5e6..745fbf99a902d 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -219,14 +219,22 @@ def __init__( ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.tok_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + intermediate_tensors: IntermediateTensors = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - hidden_states = self.tok_embeddings(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.tok_embeddings(input_ids) residual = None for i in range(len(self.layers)): layer = self.layers[i] diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py new file mode 100644 index 0000000000000..f64c78c15f8ee --- /dev/null +++ b/vllm/model_executor/models/internvl.py @@ -0,0 +1,471 @@ +# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2023 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union + +import torch +import torch.nn as nn +import torchvision.transforms as T +from PIL import Image +from transformers import PretrainedConfig + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models import ModelRegistry +from vllm.model_executor.models.intern_vit import InternVisionModel +from vllm.model_executor.sampling_metadata import SamplingMetadata 
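The internlm2.py hunk above (and a matching qwen2.py hunk later in this patch) lets a multimodal wrapper hand the language model precomputed embeddings instead of token ids, so that vision features can be spliced in at image-placeholder positions. The following is a minimal, self-contained sketch of that pattern, not the vLLM call sites themselves; ToyDecoder, the vocabulary size, and image token id 99 are all illustrative assumptions, and merge_vision_embeddings here is a simplified stand-in for the helper imported from .utils.

import torch
import torch.nn as nn


def merge_vision_embeddings(input_ids: torch.Tensor,
                            inputs_embeds: torch.Tensor,
                            vision_embeds: torch.Tensor,
                            image_token_id: int) -> torch.Tensor:
    # Overwrite the embeddings of image placeholder tokens with the projected
    # vision features; every other position keeps its text embedding.
    mask = input_ids == image_token_id
    inputs_embeds[mask] = vision_embeds.reshape(-1, vision_embeds.shape[-1])
    return inputs_embeds


class ToyDecoder(nn.Module):
    def __init__(self, vocab_size: int = 128, hidden_size: int = 16):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)

    def forward(self, input_ids, inputs_embeds=None):
        # Same fallback as the InternLM2Model / Qwen2Model changes in this
        # patch: only embed input_ids when no precomputed embeddings arrive.
        if inputs_embeds is not None:
            hidden_states = inputs_embeds
        else:
            hidden_states = self.embed_tokens(input_ids)
        return hidden_states


# Usage: three text tokens around two image placeholders (id 99 is assumed).
with torch.no_grad():
    decoder = ToyDecoder()
    input_ids = torch.tensor([1, 99, 99, 2, 3])
    embeds = decoder.embed_tokens(input_ids)
    vision = torch.zeros(2, 16)   # stand-in for projected ViT features
    embeds = merge_vision_embeddings(input_ids, embeds, vision,
                                     image_token_id=99)
    hidden = decoder(input_ids=None, inputs_embeds=embeds)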
+from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensors +from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.image import cached_get_tokenizer +from vllm.sequence import IntermediateTensors, SamplerOutput + +from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, + get_clip_num_patches) +from .interfaces import SupportsVision +from .utils import merge_vision_embeddings + +IMG_START = '' +IMG_END = '' +IMG_CONTEXT = '' + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + +MAX_IMAGE_FEATURE_SIZE_WIDTH = 3000 +MAX_IMAGE_FEATURE_SIZE_HEIGHT = 500 + + +class InternVLImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: BatchedTensors + """ + Shape: `(batch_size, 1 + num_patches, num_channels, height, width)` + + Note that `num_patches` may be different for each batch, in which case + the data is passed as a list instead of a batched tensor. + """ + + +# copied from https://huggingface.co/OpenGVLab/InternVL2-1B +def build_transform(input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose([ + T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Resize((input_size, input_size), + interpolation=T.InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD) + ]) + return transform + + +# copied from https://huggingface.co/OpenGVLab/InternVL2-1B +def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, + image_size): + best_ratio_diff = float('inf') + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + +def calculate_num_blocks(orig_width: int, + orig_height: int, + min_num=1, + max_num=6, + image_size=448): + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set((i, j) for n in range(min_num, max_num + 1) + for i in range(1, n + 1) for j in range(1, n + 1) + if i * j <= max_num and i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, + target_ratios, orig_width, + orig_height, image_size) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + return blocks, target_width, target_height + + +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +def dynamic_preprocess(image, + min_num=1, + max_num=6, + image_size=448, + use_thumbnail=False): + orig_width, orig_height = image.size + + blocks, target_width, target_height = calculate_num_blocks( + orig_width, orig_height, min_num, max_num, image_size) + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ((i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size) + # split the image + split_img = resized_img.crop(box) + 
processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +def image_to_pixel_values(image: Image.Image, input_size=448, max_num=6): + transform = build_transform(input_size=input_size) + images = dynamic_preprocess(image, + image_size=input_size, + use_thumbnail=True, + max_num=max_num) + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + return pixel_values + + +def get_internvl_num_patches(image_size: int, patch_size: int, + downsample_ratio: float): + return int( + get_clip_num_patches(image_size=image_size, patch_size=patch_size) * + (downsample_ratio**2)) + + +def get_max_internvl_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(PretrainedConfig) + vision_config = hf_config.vision_config + image_size = vision_config.image_size + patch_size = vision_config.patch_size + downsample_ratio = hf_config.downsample_ratio + num_patches = get_internvl_num_patches(image_size, patch_size, + downsample_ratio) + return num_patches * 7 + + +def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + model_config = ctx.model_config + hf_config = ctx.get_hf_config(PretrainedConfig) + vision_config = hf_config.vision_config + + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + width, height = image_data.size + num_blocks, _, _ = calculate_num_blocks(width, height) + elif isinstance(image_data, torch.Tensor): + raise NotImplementedError("Embeddings input is not supported yet") + else: + raise TypeError(f"Invalid image type: {type(image_data)}") + + image_size = vision_config.image_size + patch_size = vision_config.patch_size + downsample_ratio = hf_config.downsample_ratio + num_patches = get_internvl_num_patches(image_size, patch_size, + downsample_ratio) + + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + + prompt = llm_inputs["prompt"] + prompt_token_ids = llm_inputs["prompt_token_ids"] + if prompt is None: + prompt = tokenizer.decode(prompt_token_ids) + image_prompt = IMG_START + IMG_CONTEXT * (num_blocks + + 1) * num_patches + IMG_END + new_prompt = prompt.replace('', image_prompt, 1) + new_prompt_token_ids = tokenizer.encode(new_prompt) + + return LLMInputs(prompt=prompt, + prompt_token_ids=new_prompt_token_ids, + multi_modal_data=multi_modal_data) + + +def input_mapper_for_internvl(ctx: InputContext, data: object): + if isinstance(data, Image.Image): + data = image_to_pixel_values(data) + model_config = ctx.model_config + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + image_token_id = tokenizer.encode(IMG_CONTEXT, + add_special_tokens=False, + return_tensors="pt")[0] + + return MultiModalInputs({ + "pixel_values": data, + "image_token_id": image_token_id + }) + + +def dummy_data_for_internvl(ctx: InputContext, seq_len: int): + + image_feature_size = get_max_internvl_image_tokens(ctx) + model_config = ctx.model_config + hf_config = ctx.get_hf_config(PretrainedConfig) + vision_config = hf_config.vision_config + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + + seq_data = 
dummy_seq_data_for_clip( + vision_config, + seq_len, + image_token_id=tokenizer.encode(IMG_CONTEXT, + add_special_tokens=False)[0], + image_feature_size_override=image_feature_size, + ) + mm_data = dummy_image_for_clip( + vision_config, + image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH, + image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + ) + + return seq_data, mm_data + + +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_internvl) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_internvl) +@INPUT_REGISTRY.register_input_processor(input_processor_for_internvl) +class InternVLChatModel(nn.Module, SupportsVision): + + def __init__(self, + config: PretrainedConfig, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + super().__init__() + + self.config = config + self.multimodal_config = multimodal_config + + image_size = config.force_image_size or config.vision_config.image_size + patch_size = config.vision_config.patch_size + self.patch_size = patch_size + self.select_layer = config.select_layer + self.num_image_token = int( + (image_size // patch_size)**2 * (config.downsample_ratio**2)) + self.downsample_ratio = config.downsample_ratio + self.ps_version = config.ps_version + + vision_feature_layer = self.select_layer + if vision_feature_layer < 0: + num_hidden_layers = config.vision_config.num_hidden_layers \ + + vision_feature_layer + 1 + else: + num_hidden_layers = vision_feature_layer + 1 + self.vision_model = InternVisionModel( + config.vision_config, num_hidden_layers_override=num_hidden_layers) + + llm_class = ModelRegistry.load_model_cls( + config.text_config.architectures[0]) + self.language_model = llm_class(config.text_config, cache_config, + quant_config) + + vit_hidden_size = config.vision_config.hidden_size + llm_hidden_size = config.text_config.hidden_size + + self.mlp1 = nn.Sequential( + nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio)**2), + nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio)**2, + llm_hidden_size), nn.GELU(), + nn.Linear(llm_hidden_size, llm_hidden_size)) + + self.img_context_token_id = None + + def pixel_shuffle(self, x, scale_factor=0.5): + n, w, h, c = x.size() + # N, W, H, C --> N, W, H * scale, C // scale + x = x.view(n, w, int(h * scale_factor), int(c / scale_factor)) + # N, W, H * scale, C // scale --> N, H * scale, W, C // scale + x = x.permute(0, 2, 1, 3).contiguous() + x = x.view(n, int(h * scale_factor), int(w * scale_factor), + int(c / (scale_factor * scale_factor))) + if self.ps_version == 'v1': + pass + else: + x = x.permute(0, 2, 1, 3).contiguous() + return x + + def extract_feature(self, pixel_values): + vit_embeds = self.vision_model(pixel_values=pixel_values) + vit_embeds = vit_embeds[:, 1:, :] + + h = w = int(vit_embeds.shape[1]**0.5) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) + vit_embeds = self.pixel_shuffle(vit_embeds, + scale_factor=self.downsample_ratio) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, + vit_embeds.shape[-1]) + vit_embeds = self.mlp1(vit_embeds) + return vit_embeds + + def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: + if list(data.shape[1:]) != [2]: + raise ValueError( + f"The expected image sizes shape is batch dimension plus " + f"{[2]}. 
You supplied {data.shape}.") + + return data + + def _validate_pixel_values( + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: + + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + + def _validate_shape(d: torch.Tensor): + actual_dims = tuple(d.shape) + + if actual_dims != expected_dims: + expected_expr = ("num_patches", *map(str, expected_dims)) + raise ValueError( + "The expected shape of pixel values in each batch element " + f"is {expected_expr}. You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[InternVLImagePixelInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_token_id = kwargs.pop("image_token_id", None) + + if pixel_values is None: + return None + + self.img_context_token_id = image_token_id[0] + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return InternVLImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + ) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> SamplerOutput: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is not None: + inputs_embeds = self.language_model.model.get_input_embeddings( + input_ids) + vit_embeds = self.extract_feature(image_input["data"]) + inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds, + vit_embeds, + self.img_context_token_id) + input_ids = None + else: + inputs_embeds = None + + hidden_states = self.language_model.model(input_ids, + positions, + kv_caches, + attn_metadata, + None, + inputs_embeds=inputs_embeds) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + return self.language_model.sample(logits, sampling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + (".gate_up_proj", ".w1", 0), + (".gate_up_proj", ".w3", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if self.config.text_config.tie_word_embeddings \ + and "lm_head.weight" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # We only do sharding for language model + # and not vision model for now. + if "vision_embed_tokens" in name and self.vision_embed_tokens: + continue + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
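The stacked_params_mapping loop above folds per-shard checkpoint weights (q_proj/k_proj/v_proj, gate_proj/up_proj) into vLLM's fused parameters, and the fallback branch just below additionally splits InternLM2's fused wqkv tensor into q/k/v shards. Here is a rough sketch of both steps; the layer name and the dimensions (8 query heads, 2 KV heads, hidden_size 64) are made up for illustration and are not taken from a real checkpoint.

import torch

# Illustrative InternLM2-style shapes (assumptions only).
num_heads, num_kv_heads, hidden_size = 8, 2, 64
head_dim = hidden_size // num_heads      # 8
kv_groups = num_heads // num_kv_heads    # 4 query heads per KV head

# 1) Name remapping: a checkpoint shard name is rewritten to the fused vLLM
#    parameter name before its weight_loader is called with a shard id.
stacked_params_mapping = [
    (".qkv_proj", ".q_proj", "q"),
    (".qkv_proj", ".k_proj", "k"),
    (".qkv_proj", ".v_proj", "v"),
]
name = "language_model.model.layers.0.self_attn.q_proj.weight"
for param_name, weight_name, shard_id in stacked_params_mapping:
    if weight_name in name:
        fused_name = name.replace(weight_name, param_name)
        print(fused_name, shard_id)  # ...self_attn.qkv_proj.weight q
        break

# 2) wqkv splitting: InternLM2 stores q, k and v interleaved per KV group, so
#    the loader reshapes to (-1, kv_groups + 2, head_dim, in_features) and
#    splits along dim=1 into kv_groups query slices plus one k and one v.
wqkv = torch.randn((kv_groups + 2) * num_kv_heads * head_dim, hidden_size)
wqkv = wqkv.view(-1, 2 + kv_groups, head_dim, wqkv.shape[-1])
wq, wk, wv = torch.split(wqkv, [kv_groups, 1, 1], dim=1)
wq = wq.reshape(-1, wq.shape[-1])  # (num_heads * head_dim, hidden) = (64, 64)
wk = wk.reshape(-1, wk.shape[-1])  # (num_kv_heads * head_dim, hidden) = (16, 64)
wv = wv.reshape(-1, wv.shape[-1])  # (16, 64)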
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + if "wqkv" in name: + config = self.config.text_config + kv_groups = (config.num_attention_heads // + config.num_key_value_heads) + head_dim = config.hidden_size // config.num_attention_heads + loaded_weight = loaded_weight.view(-1, 2 + kv_groups, + head_dim, + loaded_weight.shape[-1]) + wq, wk, wv = torch.split(loaded_weight, [kv_groups, 1, 1], + dim=1) + wq = wq.reshape(-1, wq.shape[-1]) + wk = wk.reshape(-1, wk.shape[-1]) + wv = wv.reshape(-1, wv.shape[-1]) + weight_loader = param.weight_loader + weight_loader(param, wq, 'q') + weight_loader(param, wk, 'k') + weight_loader(param, wv, 'v') + continue + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index e9aa4416eded4..3deb3d8840cc4 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -243,14 +243,22 @@ def __init__( ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_tokens(input_ids) residual = None for i in range(len(self.layers)): layer = self.layers[i] diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 3ba2e01985598..3d13631b9b2b6 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,9 +6,10 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, - JAISConfig, MedusaConfig, - MLPSpeculatorConfig, MPTConfig, - NemotronConfig, RWConfig) + InternVLChatConfig, JAISConfig, + MedusaConfig, MLPSpeculatorConfig, + MPTConfig, NemotronConfig, + RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -26,6 +27,7 @@ "jais": JAISConfig, "mlp_speculator": MLPSpeculatorConfig, "medusa": MedusaConfig, + "internvl_chat": InternVLChatConfig, "nemotron": NemotronConfig, } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 1750950b3c38b..5ccacd4a4c40a 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -4,6 +4,7 @@ # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. 
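Stepping back to the internvl.py helpers earlier in this patch: each tile contributes get_internvl_num_patches(...) tokens after pixel shuffling, and input_processor_for_internvl expands the prompt with IMG_CONTEXT repeated once per token of every tile, including the thumbnail. A short worked example of that token budget follows; downsample_ratio=0.5 and max_dynamic_patch=6 come from the InternVLChatConfig defaults below, while the 448-pixel tiles with 14-pixel patches are typical InternVL2 vision-tower values and are an assumption here.

image_size, patch_size, downsample_ratio = 448, 14, 0.5

clip_patches = (image_size // patch_size) ** 2              # 32 * 32 = 1024
tokens_per_tile = int(clip_patches * downsample_ratio ** 2)  # 1024 / 4 = 256

num_blocks = 6               # dynamic tiles chosen by calculate_num_blocks
num_tiles = num_blocks + 1   # plus one thumbnail (use_thumbnail=True)
image_tokens = num_tiles * tokens_per_tile                   # 7 * 256 = 1792

# Matches get_max_internvl_image_tokens, which returns num_patches * 7.
assert image_tokens == tokens_per_tile * 7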
from vllm.transformers_utils.configs.falcon import RWConfig +from vllm.transformers_utils.configs.internvl import InternVLChatConfig from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.medusa import MedusaConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig @@ -15,6 +16,7 @@ "DbrxConfig", "MPTConfig", "RWConfig", + "InternVLChatConfig", "JAISConfig", "MedusaConfig", "MLPSpeculatorConfig", diff --git a/vllm/transformers_utils/configs/internvl.py b/vllm/transformers_utils/configs/internvl.py new file mode 100644 index 0000000000000..ac2492317aa36 --- /dev/null +++ b/vllm/transformers_utils/configs/internvl.py @@ -0,0 +1,51 @@ +# Adapted from +# https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2024 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from transformers.configuration_utils import PretrainedConfig + + +class InternVLChatConfig(PretrainedConfig): + model_type = 'internvl_chat' + is_composition = True + + def __init__(self, + vision_config=None, + llm_config=None, + use_backbone_lora=0, + use_llm_lora=0, + select_layer=-1, + force_image_size=None, + downsample_ratio=0.5, + template=None, + dynamic_image_size=False, + use_thumbnail=False, + ps_version='v1', + min_dynamic_patch=1, + max_dynamic_patch=6, + **kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + + if llm_config is None: + llm_config = {} + + self.vision_config = PretrainedConfig(**vision_config) + self.text_config = PretrainedConfig(**llm_config) + + self.use_backbone_lora = use_backbone_lora + self.use_llm_lora = use_llm_lora + self.select_layer = select_layer + self.force_image_size = force_image_size + self.downsample_ratio = downsample_ratio + self.template = template + self.dynamic_image_size = dynamic_image_size + self.use_thumbnail = use_thumbnail + self.ps_version = ps_version # pixel shuffle version + self.min_dynamic_patch = min_dynamic_patch + self.max_dynamic_patch = max_dynamic_patch From 766435e660a786933392eb8ef0a873bc38cf0c8b Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Mon, 29 Jul 2024 11:42:35 -0400 Subject: [PATCH 075/167] [Kernel] Tuned FP8 Kernels for Ada Lovelace (#6677) Co-authored-by: Varun Sundar Rabindranath --- .../cutlass_benchmarks/w8a8_benchmarks.py | 2 +- .../cutlass_w8a8/scaled_mm_c2x.cu | 520 ++---------------- .../cutlass_w8a8/scaled_mm_c2x.cuh | 340 ++++++++++++ .../scaled_mm_c2x_sm80_dispatch.cuh | 139 +++++ .../scaled_mm_c2x_sm89_dispatch.cuh | 362 ++++++++++++ tests/kernels/test_cutlass.py | 4 +- 6 files changed, 877 insertions(+), 490 deletions(-) create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_dispatch.cuh diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 234c2c8a1074c..70247e94e63cf 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -13,7 +13,7 @@ from vllm import _custom_ops as ops from vllm.utils import FlexibleArgumentParser -DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:] +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = 
[1, 16, 32, 64, 128, 256, 512] DEFAULT_TP_SIZES = [1] diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index 6ce25c5ac897b..d26c43de522c9 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -1,470 +1,16 @@ #include #include - -#include - -// clang-format will break include orders -// clang-format off -#include "cute/tensor.hpp" -#include "cute/atom/mma_atom.hpp" -#include "cutlass/numeric_types.h" - -#include "cutlass/util/device_memory.h" - #include "cutlass/cutlass.h" -#include "cutlass/gemm_coord.h" -#include "cutlass/arch/mma_sm75.h" -#include "cutlass/arch/arch.h" -#include "cutlass/arch/mma.h" -#include "cutlass/gemm/device/gemm.h" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/epilogue/threadblock/fusion/visitors.hpp" -#include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h" - -#include "broadcast_load_epilogue_c2x.hpp" -#include "common.hpp" -// clang-format on - -using namespace cute; +#include "scaled_mm_c2x.cuh" +#include "scaled_mm_c2x_sm80_dispatch.cuh" +#include "scaled_mm_c2x_sm89_dispatch.cuh" /* This file defines quantized GEMM operations using the CUTLASS 2.x API, for NVIDIA GPUs with SM versions prior to sm90 (Hopper). - - Epilogue functions can be defined to post-process the output before it is - written to GPU memory. - Epilogues must contain a public type named EVTCompute of type Sm80EVT, - as well as a static prepare_args function that constructs an - EVTCompute::Arguments struct. */ -namespace { - -// Wrappers for the GEMM kernel that is used to guard against compilation on -// architectures that will never use the kernel. The purpose of this is to -// reduce the size of the compiled binary. -// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef -// into code that will be executed on the device where it is defined. -template -struct enable_sm75_to_sm80 : Kernel { - template - CUTLASS_DEVICE static void invoke(Args&&... args) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800 - Kernel::invoke(std::forward(args)...); -#endif - } -}; - -template -struct enable_sm80_to_sm89 : Kernel { - template - CUTLASS_DEVICE static void invoke(Args&&... args) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890 - Kernel::invoke(std::forward(args)...); -#endif - } -}; - -template -struct enable_sm89_to_sm90 : Kernel { - template - CUTLASS_DEVICE static void invoke(Args&&... args) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900 - Kernel::invoke(std::forward(args)...); -#endif - } -}; - -/* - * This class provides the common ScaleA and ScaleB descriptors for the - * ScaledEpilogue and ScaledEpilogueBias classes. - */ -template -struct ScaledEpilogueBase { - protected: - using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; - - using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< - OutputTileThreadMap, float, Stride, Int<0>, Int<0>>>; - - using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< - OutputTileThreadMap, float, Stride, Int<1>, Int<0>>>; -}; - -/* - This epilogue function defines a quantized GEMM operation similar to - torch._scaled_mm. - - A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or - per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. 
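The comment block being moved into scaled_mm_c2x.cuh here describes the epilogue semantics: D = (a_scales * A) (b_scales * B), with per-tensor or per-row/per-column scales broadcast numpy-style. The snippet below is a torch-level numerical reference of that computation for illustration only; it mirrors what the CUTLASS epilogue fuses into the GEMM, not how the kernel itself is written, and the shapes are arbitrary.

import torch

def scaled_mm_reference(a: torch.Tensor,        # (M, K) quantized values
                        b: torch.Tensor,        # (K, N) quantized values
                        a_scales: torch.Tensor,  # scalar or (M, 1)
                        b_scales: torch.Tensor,  # scalar or (1, N)
                        out_dtype=torch.bfloat16) -> torch.Tensor:
    # Dequantize with broadcasting, matmul in float32, then narrow to the
    # requested output dtype. The real kernel applies the scales in the
    # epilogue instead of materializing dequantized operands.
    a_fp = a.to(torch.float32) * a_scales.to(torch.float32)
    b_fp = b.to(torch.float32) * b_scales.to(torch.float32)
    return (a_fp @ b_fp).to(out_dtype)

# Per-row scaling of A and per-column scaling of B on an 8x16x4 problem.
a = torch.randint(-128, 127, (8, 16), dtype=torch.int8)
b = torch.randint(-128, 127, (16, 4), dtype=torch.int8)
a_scales = torch.rand(8, 1)
b_scales = torch.rand(1, 4)
d = scaled_mm_reference(a, b, a_scales, b_scales)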
- A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. - - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. These scales may be either per-tensor or - per row or column. -*/ -template -struct ScaledEpilogue - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::ScaleA; - using ScaleB = typename SUPER::ScaleB; - - using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::threadblock::Sm80EVT; - - using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::threadblock::Sm80EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { - using ScaleAArgs = typename ScaleA::Arguments; - using ScaleBArgs = typename ScaleB::Arguments; - - ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; - ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; - - typename EVTCompute0::Arguments evt0_compute_args{b_args}; - - typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args}; - return evt_compute_args; - } -}; - -template -struct ScaledEpilogueBias - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::ScaleA; - using ScaleB = typename SUPER::ScaleB; - - using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::threadblock::Sm80EVT; - - using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast< - OutputTileThreadMap, ElementD, Stride, Int<1>, Int<0>>>; - - public: - using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& bias) { - using ScaleAArgs = typename ScaleA::Arguments; - using ScaleBArgs = typename ScaleB::Arguments; - using BiasArgs = typename Bias::Arguments; - - ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; - ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; - BiasArgs bias_args{static_cast(bias.data_ptr()), {}}; - - typename EVTCompute0::Arguments evt0_compute_args{b_args}; - - typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args, - bias_args}; - return evt_compute_args; - } -}; - -template typename ArchGuard, - typename ElementAB_, typename ElementD_, - template typename Epilogue_, typename TileShape, - typename WarpShape, typename InstructionShape, int32_t MainLoopStages> -struct cutlass_2x_gemm { - using ElementAB = ElementAB_; - using ElementD = ElementD_; - - using ElementAcc = - typename std::conditional, int32_t, - float>::type; - - using 
Operator = - typename std::conditional, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::arch::OpMultiplyAdd>::type; - - using OutputTileThreadMap = - cutlass::epilogue::threadblock::OutputTileThreadLayout< - TileShape, WarpShape, float, 4, 1 /* epilogue stages */ - >; - - using Epilogue = Epilogue_; - using EVTCompute = typename Epilogue::EVTCompute; - - using D = cutlass::epilogue::threadblock::VisitorAuxStore< - OutputTileThreadMap, ElementD, cutlass::FloatRoundStyle::round_to_nearest, - Stride, Int<0>>>; - - using EVTD = cutlass::epilogue::threadblock::Sm80EVT; - - // clang-format off - using RowMajor = typename cutlass::layout::RowMajor; - using ColumnMajor = typename cutlass::layout::ColumnMajor; - using KernelType = - ArchGuard::GemmKernel>; - // clang-format on - - using Op = cutlass::gemm::device::GemmUniversalAdapter; -}; - -template -void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... epilogue_params) { - using ElementAB = typename Gemm::ElementAB; - using ElementD = typename Gemm::ElementD; - - int32_t m = a.size(0); - int32_t n = b.size(1); - int32_t k = a.size(1); - cutlass::gemm::GemmCoord problem_size{m, n, k}; - - int64_t lda = a.stride(0); - int64_t ldb = b.stride(1); - int64_t ldc = out.stride(0); - - using StrideC = Stride, Int<0>>; - StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; - - auto a_ptr = static_cast(a.data_ptr()); - auto b_ptr = static_cast(b.data_ptr()); - auto c_ptr = static_cast(out.data_ptr()); - - typename Gemm::D::Arguments d_args{c_ptr, c_stride}; - - using Epilogue = typename Gemm::Epilogue; - auto evt_args = - Epilogue::prepare_args(std::forward(epilogue_params)...); - - typename Gemm::EVTD::Arguments epilogue_args{ - evt_args, - d_args, - }; - - typename Gemm::Op::Arguments args{ - cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel, // universal mode - problem_size, // problem size - 1, // batch count - epilogue_args, - a_ptr, - b_ptr, - nullptr, - nullptr, - 0, - 0, - 0, - 0, - lda, - ldb, - ldc, - ldc}; - - // Launch the CUTLASS GEMM kernel. - typename Gemm::Op gemm_op; - size_t workspace_size = gemm_op.get_workspace_size(args); - cutlass::device_memory::allocation workspace(workspace_size); - - auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); - - CUTLASS_CHECK(gemm_op.can_implement(args)); - cutlass::Status status = gemm_op(args, workspace.get(), stream); - CUTLASS_CHECK(status); -} - -template -void fallback_cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... args) { - // In some cases, the GPU isn't able to accommodate the - // shared memory requirements of the Gemm. In such cases, use - // the FallbackGemm instead. 
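fallback_cutlass_gemm_caller (whose body is being moved into scaled_mm_c2x.cuh in the deleted lines here) picks between the tuned kernel and a smaller-footprint fallback by comparing each kernel's SharedStorage size against the device's opt-in shared-memory-per-block limit. Below is a plain-Python sketch of that decision with the sizes passed in as parameters rather than queried from CUDA; the 122880/61440-byte figures come from the sm80 config comments below, while the per-device opt-in limits in the usage lines are approximate values supplied as assumptions.

def choose_gemm(gemm_smem_bytes: int,
                fallback_smem_bytes: int,
                max_smem_per_block_optin: int) -> str:
    # Prefer the tuned config whenever it fits in the opt-in shared-memory
    # budget; otherwise require (and use) the fallback config.
    if gemm_smem_bytes <= max_smem_per_block_optin:
        return "primary"
    assert fallback_smem_bytes <= max_smem_per_block_optin, \
        "fallback kernel must always fit"
    return "fallback"

# sm80_config_M64 needs 122880 bytes of shared memory, which exceeds the
# roughly 99 KB opt-in limit of smaller Ampere parts, so they fall back to
# the M32 config (61440 bytes); A100-class parts (about 163 KB) do not.
print(choose_gemm(122880, 61440, 101376))   # fallback
print(choose_gemm(122880, 61440, 166912))   # primary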
- static const int max_shared_mem_per_block_opt_in = - get_cuda_max_shared_memory_per_block_opt_in(0); - - size_t const gemm_shared_mem_size = - sizeof(typename Gemm::KernelType::SharedStorage); - size_t const fallback_gemm_shared_mem_size = - sizeof(typename FallbackGemm::KernelType::SharedStorage); - - if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) { - return cutlass_gemm_caller(out, a, b, - std::forward(args)...); - } else { - TORCH_CHECK(fallback_gemm_shared_mem_size <= - max_shared_mem_per_block_opt_in); - return cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } -} - -template typename Epilogue> -struct sm80_config_default { - // This config is used in 2 cases, - // - M in (128, inf) - // - M in (64, 128] and N >= 8192 - // Shared Memory required by this Gemm - 81920 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; - using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -template typename Epilogue> -struct sm80_config_M64 { - // This config is used in 2 cases, - // - M in (32, 64] - // - M in (64, 128] and N < 8192 - // Shared Memory required by this Gemm - 122880 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>; - using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -template typename Epilogue> -struct sm80_config_M32 { - // M in (16, 32] - // Shared Memory required by this Gemm - 61440 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>; - using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -template typename Epilogue> -struct sm80_config_M16 { - // M in [1, 16] - // Shared Memory required by this Gemm - 51200 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>; - using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -} // namespace - -template typename Epilogue, - typename... EpilogueArgs> -void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... args) { - static_assert(std::is_same()); - TORCH_CHECK(a.dtype() == torch::kInt8); - TORCH_CHECK(b.dtype() == torch::kInt8); - - using Cutlass2xGemmDefault = - typename sm80_config_default::Cutlass2xGemm; - using Cutlass2xGemmM128BigN = - typename sm80_config_default::Cutlass2xGemm; - using Cutlass2xGemmM128SmallN = - typename sm80_config_M64::Cutlass2xGemm; - using Cutlass2xGemmM64 = - typename sm80_config_M64::Cutlass2xGemm; - using Cutlass2xGemmM32 = - typename sm80_config_M32::Cutlass2xGemm; - using Cutlass2xGemmM16 = - typename sm80_config_M16::Cutlass2xGemm; - - // Due to shared memory requirements, some Gemms may fail to run on some - // GPUs. As the name indicates, the Fallback Gemm is used as an alternative - // in such cases. - // sm80_config_M16 has the least shared-memory requirement. 
However, - // based on some profiling, we select sm80_config_M32 as a better alternative - // performance wise. - using FallbackGemm = - typename sm80_config_M32::Cutlass2xGemm; - - uint32_t const m = a.size(0); - uint32_t const mp2 = - std::max(static_cast(16), next_pow_2(m)); // next power of 2 - if (mp2 <= 16) { - // M in [1, 16] - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else if (mp2 <= 32) { - // M in (16, 32] - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else if (mp2 <= 64) { - // M in (32, 64] - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else if (mp2 <= 128) { - // M in (64, 128] - uint32_t const n = out.size(1); - bool const small_n = n < 8192; - if (small_n) { - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else { - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } - } else { - // M in (128, inf) - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } -} - template