From 08287ef6751e79a89bf4f060f5f9545560a6de12 Mon Sep 17 00:00:00 2001 From: Kyle Mistele Date: Mon, 9 Sep 2024 09:45:11 -0500 Subject: [PATCH 1/7] [Bugfix] Streamed tool calls now more strictly follow OpenAI's format; ensures Vercel AI SDK compatibility (#8272) --- tests/tool_use/utils.py | 2 +- vllm/entrypoints/openai/protocol.py | 7 ----- vllm/entrypoints/openai/serving_chat.py | 6 ++++- .../tool_parsers/abstract_tool_parser.py | 1 - .../openai/tool_parsers/hermes_tool_parser.py | 20 ++++---------- .../tool_parsers/mistral_tool_parser.py | 27 ++++++------------- 6 files changed, 19 insertions(+), 44 deletions(-) diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index 8ec9b05b2c521..e447469e33410 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -19,7 +19,7 @@ class ServerConfig(TypedDict): CONFIGS: Dict[str, ServerConfig] = { "hermes": { "model": - "NousResearch/Hermes-2-Pro-Llama-3-8B", + "NousResearch/Hermes-3-Llama-3.1-8B", "arguments": [ "--tool-call-parser", "hermes", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja") diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 970262a4bd358..374196044b7e8 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -713,13 +713,6 @@ class DeltaToolCall(OpenAIBaseModel): function: Optional[DeltaFunctionCall] = None -# the initial delta that gets sent once a new tool call is started; -class InitialDeltaToolCall(DeltaToolCall): - id: str = Field(default_factory=lambda: f"chatcmpl-tool-{random_uuid()}") - type: Literal["function"] = "function" - index: int - - class ExtractedToolCallInformation(BaseModel): # indicate if tools were called tools_called: bool diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 78f355228012f..8ed81e9c88cb2 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -271,9 +271,13 @@ async def chat_completion_stream_generator( # NOTE num_choices defaults to 1 so this usually executes # once per request for i in range(num_choices): + choice_data = ChatCompletionResponseStreamChoice( index=i, - delta=DeltaMessage(role=role), + delta=DeltaMessage( + role=role, + content="", + ), logprobs=None, finish_reason=None) chunk = ChatCompletionStreamResponse( diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index b0807e6f1e782..873f615d43257 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -20,7 +20,6 @@ def __init__(self, tokenizer: AnyTokenizer): # the index of the tool call that is currently being parsed self.current_tool_id: int = -1 self.current_tool_name_sent: bool = False - self.current_tool_initial_sent: bool = False self.streamed_args_for_tool: List[str] = [] self.model_tokenizer = tokenizer diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index 7afbca7162edf..bde9b47ce60d5 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -8,14 +8,14 @@ from vllm.entrypoints.openai.protocol import (DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, - InitialDeltaToolCall, ToolCall) + FunctionCall, ToolCall) from 
vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser) from vllm.entrypoints.openai.tool_parsers.utils import ( extract_intermediate_diff) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.utils import random_uuid logger = init_logger(__name__) @@ -34,7 +34,6 @@ def __init__(self, tokenizer: AnyTokenizer): self.prev_tool_call_arr: List[Dict] = [] self.current_tool_id: int = -1 self.current_tool_name_sent = False - self.current_tool_initial_sent: bool = False self.streamed_args_for_tool: List[str] = [ ] # map what has been streamed for each tool so far to a list @@ -168,7 +167,6 @@ def extract_tool_calls_streaming( # set cursors and state appropriately self.current_tool_id += 1 self.current_tool_name_sent = False - self.current_tool_initial_sent = False self.streamed_args_for_tool.append("") logger.debug("Starting on a new tool %s", self.current_tool_id) @@ -218,24 +216,16 @@ def extract_tool_calls_streaming( logger.debug('not enough tokens to parse into JSON yet') return None - # case - we haven't sent the initial delta with the tool call ID - # (it will be sent) - if not self.current_tool_initial_sent: - self.current_tool_initial_sent = True - return DeltaMessage(tool_calls=[ - InitialDeltaToolCall( - index=self.current_tool_id).model_dump( - exclude_none=True) - ]) - # case - we haven't sent the tool name yet. If it's available, send # it. otherwise, wait until it's available. - elif not self.current_tool_name_sent: + if not self.current_tool_name_sent: function_name: Union[str, None] = current_tool_call.get("name") if function_name: self.current_tool_name_sent = True return DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, + type="function", + id=f"chatcmpl-tool-{random_uuid()}", function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index d48770c792e98..4b0e1c91df97c 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -8,14 +8,14 @@ from vllm.entrypoints.openai.protocol import (DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, - InitialDeltaToolCall, ToolCall) + FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser) from vllm.entrypoints.openai.tool_parsers.utils import ( extract_intermediate_diff) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.utils import random_uuid logger = init_logger(__name__) @@ -25,7 +25,7 @@ class MistralToolParser(ToolParser): Tool call parser for Mistral 7B Instruct v0.3, intended for use with the examples/tool_chat_template_mistral.jinja template. 
- Used when --enable-auto-tool-choice --tool-call-parser gmistral are all set + Used when --enable-auto-tool-choice --tool-call-parser mistral are all set """ def __init__(self, tokenizer: AnyTokenizer): @@ -42,7 +42,6 @@ def __init__(self, tokenizer: AnyTokenizer): self.prev_tool_call_arr: List[Dict] = [] self.current_tool_id: int = -1 self.current_tool_name_sent: bool = False - self.current_tool_initial_sent: bool = False self.streamed_args_for_tool: List[str] = [ ] # map what has been streamed for each tool so far to a list self.bot_token = "[TOOL_CALLS]" @@ -91,7 +90,6 @@ def extract_tool_calls(self, except Exception as e: logger.error("Error in extracting tool call from response: %s", e) - print("ERROR", e) # return information to just treat the tool call as regular JSON return ExtractedToolCallInformation(tools_called=False, tool_calls=[], @@ -109,7 +107,7 @@ def extract_tool_calls_streaming( # if the tool call token is not in the tokens generated so far, append # output to contents since it's not a tool - if self.bot_token_id not in current_token_ids: + if self.bot_token not in current_text: return DeltaMessage(content=delta_text) # if the tool call token ID IS in the tokens generated so far, that @@ -134,7 +132,7 @@ def extract_tool_calls_streaming( # replace BOT token with empty string, and convert single quotes # to double to allow parsing as JSON since mistral uses single # quotes instead of double for tool calls - parsable_arr = current_text.split(self.bot_token)[1] + parsable_arr = current_text.split(self.bot_token)[-1] # tool calls are generated in an array, so do partial JSON # parsing on the entire array @@ -186,31 +184,22 @@ def extract_tool_calls_streaming( # re-set stuff pertaining to progress in the current tool self.current_tool_id = len(tool_call_arr) - 1 self.current_tool_name_sent = False - self.current_tool_initial_sent = False self.streamed_args_for_tool.append("") logger.debug("starting on new tool %d", self.current_tool_id) return delta # case: update an existing tool - this is handled below - # if the current tool initial data incl. 
the id, type=function - # and idx not sent, send that - if not self.current_tool_initial_sent: - self.current_tool_initial_sent = True - delta = DeltaMessage(tool_calls=[ - InitialDeltaToolCall( - index=self.current_tool_id).model_dump( - exclude_none=True) - ]) - # if the current tool name hasn't been sent, send if available # - otherwise send nothing - elif not self.current_tool_name_sent: + if not self.current_tool_name_sent: function_name = current_tool_call.get("name") if function_name: delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, + type="function", + id=f"chatcmpl-tool-{random_uuid()}", function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) From 58fcc8545a149c9c5b1f91f417a68f5ba1fdabf3 Mon Sep 17 00:00:00 2001 From: Adam Lugowski Date: Mon, 9 Sep 2024 11:16:37 -0700 Subject: [PATCH 2/7] [Frontend] Add progress reporting to run_batch.py (#8060) Co-authored-by: Adam Lugowski --- vllm/entrypoints/openai/run_batch.py | 54 ++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 32bbade256973..278be8cd11a12 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -1,9 +1,11 @@ import asyncio from io import StringIO -from typing import Awaitable, Callable, List +from typing import Awaitable, Callable, List, Optional import aiohttp +import torch from prometheus_client import start_http_server +from tqdm import tqdm from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -78,6 +80,38 @@ def parse_args(): return parser.parse_args() +# explicitly use pure text format, with a newline at the end +# this makes it impossible to see the animation in the progress bar +# but will avoid messing up with ray or multiprocessing, which wraps +# each line of output with some prefix. 
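+# Illustrative rendering of this bar format with desc="Running batch" and
+# unit="req" (counts, times and rate below are placeholder values, not
+# output from a real run):
+#   Running batch:  42% Completed | 42/100 [00:30<00:41, 1.40req/s]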
+_BAR_FORMAT = "{desc}: {percentage:3.0f}% Completed | {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]\n" # noqa: E501 + + +class BatchProgressTracker: + + def __init__(self): + self._total = 0 + self._pbar: Optional[tqdm] = None + + def submitted(self): + self._total += 1 + + def completed(self): + if self._pbar: + self._pbar.update() + + def pbar(self) -> tqdm: + enable_tqdm = not torch.distributed.is_initialized( + ) or torch.distributed.get_rank() == 0 + self._pbar = tqdm(total=self._total, + unit="req", + desc="Running batch", + mininterval=5, + disable=not enable_tqdm, + bar_format=_BAR_FORMAT) + return self._pbar + + async def read_file(path_or_url: str) -> str: if path_or_url.startswith("http://") or path_or_url.startswith("https://"): async with aiohttp.ClientSession() as session, \ @@ -102,7 +136,8 @@ async def write_file(path_or_url: str, data: str) -> None: async def run_request(serving_engine_func: Callable, - request: BatchRequestInput) -> BatchRequestOutput: + request: BatchRequestInput, + tracker: BatchProgressTracker) -> BatchRequestOutput: response = await serving_engine_func(request.body) if isinstance(response, (ChatCompletionResponse, EmbeddingResponse)): @@ -125,6 +160,7 @@ async def run_request(serving_engine_func: Callable, else: raise ValueError("Request must not be sent in stream mode") + tracker.completed() return batch_output @@ -164,6 +200,9 @@ async def main(args): request_logger=request_logger, ) + tracker = BatchProgressTracker() + logger.info("Reading batch from %s...", args.input_file) + # Submit all requests in the file to the engine "concurrently". response_futures: List[Awaitable[BatchRequestOutput]] = [] for request_json in (await read_file(args.input_file)).strip().split("\n"): @@ -178,16 +217,19 @@ async def main(args): if request.url == "/v1/chat/completions": response_futures.append( run_request(openai_serving_chat.create_chat_completion, - request)) + request, tracker)) + tracker.submitted() elif request.url == "/v1/embeddings": response_futures.append( - run_request(openai_serving_embedding.create_embedding, - request)) + run_request(openai_serving_embedding.create_embedding, request, + tracker)) + tracker.submitted() else: raise ValueError("Only /v1/chat/completions and /v1/embeddings are" "supported in the batch endpoint.") - responses = await asyncio.gather(*response_futures) + with tracker.pbar(): + responses = await asyncio.gather(*response_futures) output_buffer = StringIO() for response in responses: From f9b4a2d41587da0692d32797221df55a02d890a6 Mon Sep 17 00:00:00 2001 From: Vladislav Kruglikov Date: Mon, 9 Sep 2024 21:20:46 +0300 Subject: [PATCH 3/7] [Bugfix] Correct adapter usage for cohere and jamba (#8292) --- vllm/model_executor/models/commandr.py | 5 +++-- vllm/model_executor/models/jamba.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index be7f19d15b623..649dc798d22dc 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -47,6 +47,8 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import IntermediateTensors +from .interfaces import SupportsLoRA + @torch.compile def layer_norm_func(hidden_states, weight, variance_epsilon): @@ -292,8 +294,7 @@ def forward( return hidden_states -class CohereForCausalLM(nn.Module): - +class CohereForCausalLM(nn.Module, SupportsLoRA): packed_modules_mapping = { "qkv_proj": [ "q_proj", diff --git 
a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 73be7ffed0f89..29dd09afac5ad 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -38,6 +38,8 @@ from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE, _get_graph_batch_size) +from .interfaces import SupportsLoRA + KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -539,7 +541,7 @@ def forward( return hidden_states -class JambaForCausalLM(nn.Module, HasInnerState): +class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA): packed_modules_mapping = { "qkv_proj": [ "q_proj", From c7cb5c333564cb00fc4f6a99d32c35e9ebc0f1ed Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 9 Sep 2024 16:27:26 -0400 Subject: [PATCH 4/7] [Misc] GPTQ Activation Ordering (#8135) --- tests/weight_loading/models.txt | 1 + .../compressed_tensors/compressed_tensors.py | 3 +- .../schemes/compressed_tensors_wNa16.py | 45 ++++++++++++++----- .../quantization/compressed_tensors/utils.py | 30 ++++++++++++- 4 files changed, 64 insertions(+), 15 deletions(-) diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index 1dc529037a98e..c708e6d5eb897 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -21,6 +21,7 @@ compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main +compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main awq, casperhansen/mixtral-instruct-awq, main awq_marlin, casperhansen/mixtral-instruct-awq, main fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 0768b37044aac..1170d55f31993 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -232,7 +232,8 @@ def _get_scheme_from_parts( return CompressedTensorsWNA16( num_bits=weight_quant.num_bits, strategy=weight_quant.strategy, - group_size=weight_quant.group_size) + group_size=weight_quant.group_size, + actorder=weight_quant.actorder) # Detect If Activation Quantization. 
# TODO @dsikka: clean-up conditions diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 7ca8eecb9283e..8897737c1c55a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -5,14 +5,18 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + ActivationOrdering) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( apply_gptq_marlin_linear, marlin_make_empty_g_idx, marlin_make_workspace, - marlin_permute_scales, replace_tensor, verify_marlin_supported, + marlin_permute_scales, marlin_repeat_scales_on_all_ranks, + marlin_sort_g_idx, replace_tensor, verify_marlin_supported, verify_marlin_supports_shape) from vllm.model_executor.parameter import (BasevLLMParameter, ChannelQuantScaleParameter, GroupQuantScaleParameter, - PackedvLLMParameter) + PackedvLLMParameter, + RowvLLMParameter) from vllm.scalar_type import scalar_types __all__ = ["CompressedTensorsWNA16"] @@ -28,11 +32,13 @@ class CompressedTensorsWNA16(CompressedTensorsScheme): def __init__(self, strategy: str, num_bits: int, - group_size: Optional[int] = None): + group_size: Optional[int] = None, + actorder: Optional[ActivationOrdering] = None): self.pack_factor = 32 // num_bits self.strategy = strategy self.group_size = -1 if group_size is None else group_size + self.has_g_idx = actorder == ActivationOrdering.GROUP if self.group_size == -1 and self.strategy != "channel": raise ValueError("Marlin kernels require group quantization or " @@ -64,12 +70,10 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, output_size_per_partition = sum(output_partition_sizes) # If group_size is -1, we are in channelwise case. - channelwise = (self.group_size == -1) group_size = self.group_size if self.group_size != -1 else input_size row_parallel = (input_size != input_size_per_partition) - # In the case of channelwise quantization, we need to replicate the - # scales across all gpus. - partition_scales = (row_parallel and not channelwise) + partition_scales = not marlin_repeat_scales_on_all_ranks( + self.has_g_idx, self.group_size, row_parallel) verify_marlin_supports_shape( output_size_per_partition=output_size_per_partition, @@ -123,6 +127,16 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, layer.register_parameter("weight_scale", weight_scale) layer.register_parameter("weight_shape", weight_shape) + # group index (for activation reordering) + if self.has_g_idx: + weight_g_idx = RowvLLMParameter(data=torch.empty( + input_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + weight_loader=weight_loader) + layer.register_parameter("weight_g_idx", weight_g_idx) + layer.input_size_per_partition = input_size_per_partition layer.output_size_per_partition = output_size_per_partition layer.input_size = input_size @@ -137,9 +151,14 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.workspace = marlin_make_workspace( layer.output_size_per_partition, device) - # Act-order not supported in compressed-tensors yet, so set to empty. 
- layer.g_idx = marlin_make_empty_g_idx(device) - layer.g_idx_sort_indices = marlin_make_empty_g_idx(device) + # Handle sorting for activation reordering if needed. + if self.has_g_idx: + g_idx, g_idx_sort_indices = marlin_sort_g_idx(layer.weight_g_idx) + layer.g_idx_sort_indices = g_idx_sort_indices + replace_tensor(layer, "weight_g_idx", g_idx) + else: + layer.weight_g_idx = marlin_make_empty_g_idx(device) + layer.g_idx_sort_indices = marlin_make_empty_g_idx(device) # No zero-point layer.weight_zp = marlin_make_empty_g_idx(device) @@ -159,9 +178,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: replace_tensor(layer, "weight_packed", marlin_qweight) # Permute scales from compressed-tensors format to marlin format. + # scale is required on all partitions if activation reordering marlin_scales = marlin_permute_scales( layer.weight_scale, - size_k=layer.input_size_per_partition, + size_k=(layer.input_size + if self.has_g_idx else layer.input_size_per_partition), size_n=layer.output_size_per_partition, group_size=layer.group_size) replace_tensor(layer, "weight_scale", marlin_scales) @@ -174,7 +195,7 @@ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, weight=layer.weight_packed, weight_scale=layer.weight_scale, weight_zp=layer.weight_zp, - g_idx=layer.g_idx, + g_idx=layer.weight_g_idx, g_idx_sort_indices=layer.g_idx_sort_indices, workspace=layer.workspace, wtype=self.quant_type, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 7912cbde5721f..fc531b9d666e3 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -1,8 +1,8 @@ import re from enum import Enum -from typing import Any, Dict, Iterable, Optional +from typing import Any, Dict, Iterable, Optional, Union -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator from torch.nn import Module from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -40,6 +40,19 @@ class QuantizationStrategy(str, Enum): TOKEN = "token" +class ActivationOrdering(str, Enum): + """ + Enum storing strategies for activation ordering + + Group: reorder groups and weight\n + Weight: only reorder weight, not groups. Slightly lower latency and + accuracy compared to group actorder\n + """ + + GROUP = "group" + WEIGHT = "weight" + + class QuantizationArgs(BaseModel): """ User facing arguments used to define a quantization config @@ -58,6 +71,8 @@ class QuantizationArgs(BaseModel): observed with every sample. Defaults to False for static quantization. Note that enabling dynamic quantization will change the default observer to a memoryless one + :param actorder: whether to apply group quantization in decreasing order of + activation. 
Defaults to None for arbitrary ordering """ num_bits: int = 8 @@ -67,6 +82,7 @@ class QuantizationArgs(BaseModel): strategy: Optional[QuantizationStrategy] = None block_structure: Optional[str] = None dynamic: bool = False + actorder: Union[ActivationOrdering, bool, None] = None observer: str = Field( default="minmax", description=("The class to use to compute the quantization param - " @@ -79,6 +95,16 @@ class QuantizationArgs(BaseModel): "Observers constructor excluding quantization range or symmetry"), ) + @field_validator("actorder", mode="before") + def validate_actorder(cls, value) -> Optional[ActivationOrdering]: + if isinstance(value, bool): + return ActivationOrdering.GROUP if value else None + + if isinstance(value, str): + return ActivationOrdering(value.lower()) + + return value + def is_activation_quantization_format(format: str) -> bool: _ACTIVATION_QUANTIZATION_FORMATS = [ From 6cd5e5b07e4415d064d93b8a66331a097bd9287e Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Mon, 9 Sep 2024 23:02:52 -0400 Subject: [PATCH 5/7] [Misc] Fused MoE Marlin support for GPTQ (#8217) --- .buildkite/test-pipeline.yaml | 13 +- csrc/moe/marlin_moe_ops.cu | 2 +- csrc/moe/marlin_moe_ops.h | 2 +- csrc/moe/torch_bindings.cpp | 1 - tests/kernels/test_moe.py | 221 ++++++++++++- tests/weight_loading/models-large.txt | 3 + tests/weight_loading/models.txt | 2 - .../layers/fused_moe/__init__.py | 14 +- .../layers/fused_moe/fused_marlin_moe.py | 219 ++++++++++++ .../layers/fused_moe/fused_moe.py | 138 ++------ vllm/model_executor/layers/fused_moe/layer.py | 75 +++-- .../compressed_tensors_moe.py | 48 +-- .../schemes/compressed_tensors_wNa16.py | 2 +- .../layers/quantization/gptq_marlin.py | 312 +++++++++++++++++- .../layers/quantization/utils/marlin_utils.py | 17 + .../quantization/utils/marlin_utils_test.py | 11 +- .../layers/quantization/utils/quant_utils.py | 19 +- vllm/model_executor/model_loader/utils.py | 8 + vllm/model_executor/models/mixtral.py | 9 +- 19 files changed, 912 insertions(+), 204 deletions(-) create mode 100644 tests/weight_loading/models-large.txt create mode 100644 vllm/model_executor/layers/fused_moe/fused_marlin_moe.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d0317b2fc48c9..a0c7b7442b3b3 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -386,7 +386,18 @@ steps: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + +- label: Weight Loading Multiple GPU Test - Large Models # optional + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + gpu: a100 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt ##### multi gpus test ##### diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu index 1e170e80d2f70..92184f43c9eb0 100644 --- a/csrc/moe/marlin_moe_ops.cu +++ b/csrc/moe/marlin_moe_ops.cu @@ -1737,4 +1737,4 @@ torch::Tensor marlin_gemm_moe( moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par, replicate_input, apply_weights); return c; -} \ No newline at end of file +} diff --git a/csrc/moe/marlin_moe_ops.h b/csrc/moe/marlin_moe_ops.h index 01ba8ff69850d..43d264e0770d6 100644 --- a/csrc/moe/marlin_moe_ops.h +++ b/csrc/moe/marlin_moe_ops.h @@ -9,4 +9,4 @@ torch::Tensor marlin_gemm_moe( const torch::Tensor& 
g_idx, const torch::Tensor& perm, torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size, - bool replicate_input, bool apply_weights); \ No newline at end of file + bool replicate_input, bool apply_weights); diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index d4d43e2c601b5..8a0e625b43fa1 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -16,7 +16,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int " "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, " "bool replicate_input, bool apply_weights) -> Tensor"); - m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe); #endif } diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index f526c381b3339..2250cf1598b8b 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -2,6 +2,8 @@ Run `pytest tests/kernels/test_moe.py`. """ +from typing import List + import pytest import torch from transformers import MixtralConfig @@ -9,7 +11,13 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + fused_marlin_moe, single_marlin_moe) +from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk +from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( + marlin_quantize) from vllm.model_executor.models.mixtral import MixtralMoE +from vllm.scalar_type import scalar_types def torch_moe(a, w1, w2, score, topk): @@ -29,6 +37,20 @@ def torch_moe(a, w1, w2, score, topk): topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) +def torch_moe_single(a, w, score, topk): + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w.shape[1], dtype=a.dtype, device=a.device) + score = torch.softmax(score, dim=-1, dtype=torch.float32) + _, topk_ids = torch.topk(score, topk) + topk_ids = topk_ids.view(-1) + for i in range(w.shape[0]): + mask = topk_ids == i + if mask.sum(): + out[mask] = a[mask] @ w[i].transpose(0, 1) + return (out.view(B, -1, w.shape[1])).sum(dim=1) + + @pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1]) @pytest.mark.parametrize("n", [2048, 256, 1024]) @pytest.mark.parametrize("k", [128, 511, 1024]) @@ -43,11 +65,11 @@ def test_fused_moe( topk: int, dtype: torch.dtype, ): - a = torch.randn((m, k), device='cuda', dtype=dtype) / 10 - w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10 - w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10 + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 - score = torch.randn((m, e), device='cuda', dtype=dtype) + score = torch.randn((m, e), device="cuda", dtype=dtype) triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False) torch_output = torch_moe(a, w1, w2, score, topk) torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0) @@ -99,3 +121,194 @@ def test_mixtral_moe(dtype: torch.dtype): vllm_states, rtol=mixtral_moe_tol[dtype], atol=mixtral_moe_tol[dtype]) + + +def stack_and_dev(tensors: List[torch.Tensor]): + dev = tensors[0].device + return torch.stack(tensors, dim=0).to(dev) + + +def compute_max_diff(output, output_ref): + return 
torch.mean(torch.abs(output - output_ref)) / torch.mean( + torch.abs(output_ref)) + + +@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) +@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) +@pytest.mark.parametrize("k", [128, 1024, 512]) +@pytest.mark.parametrize("e", [4, 8, 64]) +@pytest.mark.parametrize("topk", [2, 6]) +@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) +@pytest.mark.parametrize("act_order", [True, False]) +def test_fused_marlin_moe( + m: int, + n: int, + k: int, + e: int, + topk: int, + group_size: int, + act_order: bool, +): + torch.manual_seed(7) + + if topk > e: + return + + # Filter act_order + if act_order: + if group_size == -1: + return + if group_size in (k, n): + return + + quant_type = scalar_types.uint4b8 + dtype = torch.float16 + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + for i in range(w2.shape[0]): + w2[0] = torch.eye(k, n, device="cuda", dtype=dtype) + + w_ref1_l = [] + qweight1_l = [] + scales1_l = [] + g_idx1_l = [] + sort_indices1_l = [] + + for i in range(w1.shape[0]): + test_perm = torch.randperm(k) + w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize( + w1[i].transpose(1, 0), quant_type, group_size, act_order, + test_perm) + w_ref1_l.append(w_ref1) + qweight1_l.append(qweight1) + scales1_l.append(scales1) + g_idx1_l.append(g_idx1) + sort_indices1_l.append(sort_indices1) + + w_ref1 = stack_and_dev(w_ref1_l) + qweight1 = stack_and_dev(qweight1_l).contiguous() + scales1 = stack_and_dev(scales1_l) + g_idx1 = stack_and_dev(g_idx1_l) + sort_indices1 = stack_and_dev(sort_indices1_l) + + w_ref2_l = [] + qweight2_l = [] + scales2_l = [] + g_idx2_l = [] + sort_indices2_l = [] + + for i in range(w2.shape[0]): + test_perm = torch.randperm(n) + w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize( + w2[i].transpose(1, 0), quant_type, group_size, act_order, + test_perm) + w_ref2_l.append(w_ref2) + qweight2_l.append(qweight2) + scales2_l.append(scales2) + g_idx2_l.append(g_idx2) + sort_indices2_l.append(sort_indices2) + + w_ref2 = stack_and_dev(w_ref2_l) + qweight2 = stack_and_dev(qweight2_l).contiguous() + scales2 = stack_and_dev(scales2_l) + g_idx2 = stack_and_dev(g_idx2_l) + sort_indices2 = stack_and_dev(sort_indices2_l) + + score = torch.randn((m, e), device="cuda", dtype=dtype) + + topk_weights, topk_ids = fused_topk(a, score, topk, False) + + triton_output = fused_moe( + a, + w_ref1.transpose(1, 2).contiguous(), + w_ref2.transpose(1, 2).contiguous(), + score, + topk, + renormalize=False, + ) + marlin_output = fused_marlin_moe( + a, + qweight1, + qweight2, + score, + g_idx1, + g_idx2, + sort_indices1, + sort_indices2, + topk_weights, + topk_ids, + w1_scale=scales1, + w2_scale=scales2, + ) + + assert compute_max_diff(marlin_output, triton_output) < 4e-2 + + +@pytest.mark.skip("This test is here for the sake of debugging, " + "don't run it in automated tests.") +@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) +@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) +@pytest.mark.parametrize("k", [128, 1024, 512]) +@pytest.mark.parametrize("e", [4, 8, 64]) +@pytest.mark.parametrize("topk", [2, 6]) +@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) +@pytest.mark.parametrize("act_order", [True, False]) +def test_marlin_moe_mmm( + m: int, + n: int, + k: int, + e: int, + topk: int, + group_size: int, + act_order: bool, +): + if topk > e: + return + + # Filter 
act_order + if act_order: + if group_size == -1: + return + if group_size == k: + return + + quant_type = scalar_types.uint4b8 + dtype = torch.float16 + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10 + + w_ref_l = [] + qweights_l = [] + scales_l = [] + g_idx_l = [] + sort_indices_l = [] + + for i in range(w.shape[0]): + test_perm = torch.randperm(k) + w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize( + w[i].transpose(1, 0), quant_type, group_size, act_order, test_perm) + w_ref_l.append(w_ref) + qweights_l.append(qweight) + scales_l.append(scales) + g_idx_l.append(g_idx) + sort_indices_l.append(sort_indices) + + w_ref = stack_and_dev(w_ref_l) + qweight = stack_and_dev(qweights_l).contiguous() + scales = stack_and_dev(scales_l) + g_idx = stack_and_dev(g_idx_l) + sort_indices = stack_and_dev(sort_indices_l) + + score = torch.randn((m, e), device="cuda", dtype=dtype) + marlin_output = single_marlin_moe(a, + qweight, + scales, + score, + g_idx, + sort_indices, + topk, + renormalize=False) + torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk) + + assert compute_max_diff(marlin_output, torch_output) < 1e-2 diff --git a/tests/weight_loading/models-large.txt b/tests/weight_loading/models-large.txt new file mode 100644 index 0000000000000..fe76705746766 --- /dev/null +++ b/tests/weight_loading/models-large.txt @@ -0,0 +1,3 @@ +compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main +compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main +gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main \ No newline at end of file diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index c708e6d5eb897..a90b352a39bca 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -19,8 +19,6 @@ compressed-tensors, nm-testing/tinyllama-oneshot-w8a16-per-channel, main compressed-tensors, nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test, main compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main -compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main -compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main awq, casperhansen/mixtral-instruct-awq, main awq_marlin, casperhansen/mixtral-instruct-awq, main diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index fd6f41b90042e..e9b5703ca28be 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -2,16 +2,22 @@ FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.triton_utils import HAS_TRITON -__all__ = ["FusedMoE", "FusedMoEMethodBase", "FusedMoeWeightScaleSupported"] +__all__ = [ + "FusedMoE", + "FusedMoEMethodBase", + "FusedMoeWeightScaleSupported", +] if HAS_TRITON: - + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + fused_marlin_moe, single_marlin_moe) from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_experts, fused_marlin_moe, fused_moe, fused_topk, - get_config_file_name, grouped_topk) + fused_experts, fused_moe, fused_topk, get_config_file_name, + grouped_topk) __all__ += [ "fused_marlin_moe", + "single_marlin_moe", "fused_moe", "fused_topk", 
"fused_experts", diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py new file mode 100644 index 0000000000000..200a6148978aa --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -0,0 +1,219 @@ +"""Fused MoE utilities for GPTQ.""" +import functools +from typing import Any, Dict, Optional + +import torch + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_topk, moe_align_block_size, try_get_optimal_moe_config) + + +def single_marlin_moe( + hidden_states: torch.Tensor, + w: torch.Tensor, + scales: torch.Tensor, + gating_output: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + topk: int, + renormalize: bool, + override_config: Optional[Dict[str, Any]] = None) -> torch.Tensor: + """ + This function computes the multiplication of hidden_states with expert + weights used in Marlin MoE, using weights w and top-k gating mechanism. + Its purpose is testing and debugging the fused MoE kernel. + + Parameters: + - hidden_states (torch.Tensor): The input tensor to the Marlin Mul. + - w (torch.Tensor): The set of expert weights. + - scales (torch.Tensor): The quantization scales. + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). + - g_idx (torch.Tensor): The act_order indices. + - perm (torch.Tensor): The act_order input permutation. + - topk (int): The number of top-k experts to select. + - renormalize (bool): If True, renormalize the top-k weights to sum to 1. + - override_config (Optional[Dict[str, Any]]): Optional override + for the kernel configuration. + + Returns: + - torch.Tensor: The output tensor after applying the MoE layer. + """ + # Check constraints. 
+ assert hidden_states.shape[0] == gating_output.shape[0], ( + "Number of tokens mismatch") + assert hidden_states.shape[1] == w.shape[1] * 16, "Hidden size mismatch" + assert gating_output.shape[1] == w.shape[0], "Number of experts mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w.is_contiguous(), "Expert weights must be contiguous" + assert hidden_states.dtype == torch.float16 + + M, K = hidden_states.shape + E = w.shape[0] + N = w.shape[2] // 2 + + topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, + renormalize) + + # This might not be an optimal config for a single MMM + get_config_func = functools.partial(try_get_optimal_moe_config, + w.shape, + w.shape, + topk_ids.shape[1], + None, + override_config=override_config, + is_marlin=True) + config = get_config_func(M) + + block_size_m = config['BLOCK_SIZE_M'] + + sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) + + max_workspace_size = (N // 64) * 16 + workspace = torch.zeros(max_workspace_size, + dtype=torch.int, + device="cuda", + requires_grad=False) + + intermediate_cache = torch.ops._moe_C.marlin_gemm_moe( + hidden_states, w, sorted_token_ids, topk_weights, topk_ids, scales, + g_idx, perm, workspace, M, N, K, True, E, topk, block_size_m, True, + False) + + return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1) + + +def fused_marlin_moe( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + gating_output: torch.Tensor, + g_idx1: torch.Tensor, + g_idx2: torch.Tensor, + perm1: torch.Tensor, + perm2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + override_config: Optional[Dict[str, Any]] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, +) -> torch.Tensor: + """ + This function computes a Mixture of Experts (MoE) layer using two sets of + weights, w1 and w2, and top-k gating mechanism. + + Parameters: + - hidden_states (torch.Tensor): The input tensor to the MoE layer. + - w1 (torch.Tensor): The first set of expert weights. + - w2 (torch.Tensor): The second set of expert weights. + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). + - g_idx1 (torch.Tensor): The first set of act_order indices. + - g_idx2 (torch.Tensor): The second set of act_order indices. + - perm1 (torch.Tensor): The first act_order input permutation. + - perm2 (torch.Tensor): The second act_order input permutation. + - topk_weights (torch.Tensor): Top-k weights. + - topk_ids (torch.Tensor): Indices of topk-k elements. + - renormalize (bool): If True, renormalize the top-k weights to sum to 1. + - override_config (Optional[Dict[str, Any]]): Optional override + for the kernel configuration. + - w1_scale (Optional[torch.Tensor]): Optional scale to be used for + w1. + - w2_scale (Optional[torch.Tensor]): Optional scale to be used for + w2. + + Returns: + - torch.Tensor: The output tensor after applying the MoE layer. + """ + # Check constraints. 
+ assert hidden_states.shape[0] == gating_output.shape[ + 0], "Number of tokens mismatch" + assert hidden_states.shape[ + 1] == w1.shape[1] * 16, "Hidden size mismatch w1" + assert hidden_states.shape[ + 1] == w2.shape[2] // 2, "Hidden size mismatch w2" + assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w1.is_contiguous(), "Expert weights1 must be contiguous" + assert w2.is_contiguous(), "Expert weights2 must be contiguous" + assert hidden_states.dtype == torch.float16 + + M, K = hidden_states.shape + E = w1.shape[0] + N = w2.shape[1] * 16 + topk = topk_ids.shape[1] + + get_config_func = functools.partial( + try_get_optimal_moe_config, + w1.shape, + w2.shape, + topk_ids.shape[1], + None, + override_config=override_config, + is_marlin=True, + ) + config = get_config_func(M) + + block_size_m = config["BLOCK_SIZE_M"] + + sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) + + max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16 + workspace = torch.zeros(max_workspace_size, + dtype=torch.int, + device="cuda", + requires_grad=False) + + intermediate_cache2 = torch.empty( + (M * topk_ids.shape[1], N), + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + + intermediate_cache1 = torch.ops._moe_C.marlin_gemm_moe( + hidden_states, + w1, + sorted_token_ids, + topk_weights, + topk_ids, + w1_scale, + g_idx1, + perm1, + workspace, + M, + 2 * N, + K, + True, + E, + topk, + block_size_m, + True, + False, + ) + + ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) + + intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe( + intermediate_cache2, + w2, + sorted_token_ids, + topk_weights, + topk_ids, + w2_scale, + g_idx2, + perm2, + workspace, + M, + K, + N, + True, + E, + topk, + block_size_m, + False, + True, + ) + + return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), + dim=1) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 05169eaddb256..bd13d8fecbb96 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -323,15 +323,22 @@ def get_moe_configs(E: int, N: int, return None -def get_default_config(M: int, E: int, N: int, K: int, topk: int, - dtype: Optional[str], - is_marlin: bool) -> Dict[str, int]: +def get_default_config( + M: int, + E: int, + N: int, + K: int, + topk: int, + dtype: Optional[str], + is_marlin: bool, +) -> Dict[str, int]: config = { 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8 } + # A heuristic: fused marlin works faster with this config for small M if M <= E or (is_marlin and M <= 32): config = { 'BLOCK_SIZE_M': 16, @@ -342,14 +349,15 @@ def get_default_config(M: int, E: int, N: int, K: int, topk: int, return config -def try_get_optimal_moe_config(w1_shape: Tuple[int, ...], - w2_shape: Tuple[int, ...], - top_k: int, - dtype: Optional[str], - M: int, - override_config: Optional[Dict[str, - Any]] = None, - is_marlin: bool = False): +def try_get_optimal_moe_config( + w1_shape: Tuple[int, ...], + w2_shape: Tuple[int, ...], + top_k: int, + dtype: Optional[str], + M: int, + override_config: Optional[Dict[str, Any]] = None, + is_marlin: bool = False, +): if override_config: config = override_config else: @@ -391,6 +399,7 @@ def fused_topk( topk, dtype=torch.int32, device=hidden_states.device) + ops.topk_softmax( topk_weights, 
topk_ids, @@ -437,113 +446,6 @@ def grouped_topk(hidden_states: torch.Tensor, return topk_weights, topk_ids -def fused_marlin_moe(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - gating_output: torch.Tensor, - g_idx1: torch.Tensor, - g_idx2: torch.Tensor, - rand_perm1: torch.Tensor, - rand_perm2: torch.Tensor, - topk: int, - custom_routing_function: Optional[Callable] = None, - renormalize: bool = True, - override_config: Optional[Dict[str, Any]] = None, - use_fp8: bool = False, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None) -> torch.Tensor: - """ - This function computes a Mixture of Experts (MoE) layer using two sets of - weights, w1 and w2, and top-k gating mechanism. - Parameters: - - hidden_states (torch.Tensor): The input tensor to the MoE layer. - - w1 (torch.Tensor): The first set of expert weights. - - w2 (torch.Tensor): The second set of expert weights. - - gating_output (torch.Tensor): The output of the gating operation - (before softmax). - - topk (int): The number of top-k experts to select. - - renormalize (bool): If True, renormalize the top-k weights to sum to 1. - - inplace (bool): If True, perform the operation in-place. - Defaults to False. - - override_config (Optional[Dict[str, Any]]): Optional override - for the kernel configuration. - - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner - products for w1 and w2. Defaults to False. - - w1_scale (Optional[torch.Tensor]): Optional scale to be used for - w1. - - w2_scale (Optional[torch.Tensor]): Optional scale to be used for - w2. - Returns: - - torch.Tensor: The output tensor after applying the MoE layer. - """ - # Check constraints. - assert hidden_states.shape[0] == gating_output.shape[0], ( - "Number of tokens mismatch") - assert hidden_states.shape[ - 1] == w1.shape[1] * 16, "Hidden size mismatch w1" - assert hidden_states.shape[ - 1] == w2.shape[2] // 2, "Hidden size mismatch w2" - assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" - assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" - assert w1.is_contiguous(), "Expert weights1 must be contiguous" - assert w2.is_contiguous(), "Expert weights2 must be contiguous" - assert hidden_states.dtype in [ - torch.float32, torch.float16, torch.bfloat16 - ] - - #TODO fp8 is not implemented yet - assert not use_fp8 - - M, K = hidden_states.shape - E = w1.shape[0] - N = w2.shape[1] * 16 - - if custom_routing_function is None: - topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, - renormalize) - else: - topk_weights, topk_ids = custom_routing_function( - hidden_states, gating_output, topk, renormalize) - - get_config_func = functools.partial(try_get_optimal_moe_config, - w1.shape, - w2.shape, - topk_ids.shape[1], - "float8" if use_fp8 else None, - override_config=override_config, - is_marlin=True) - config = get_config_func(M) - - block_size_m = config['BLOCK_SIZE_M'] - - sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E) - - max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16 - workspace = torch.zeros(max_workspace_size, - dtype=torch.int, - device="cuda", - requires_grad=False) - - intermediate_cache2 = torch.empty((M * topk_ids.shape[1], N), - device=hidden_states.device, - dtype=hidden_states.dtype) - - intermediate_cache1 = torch.ops._moe_C.marlin_gemm_moe( - hidden_states, w1, sorted_token_ids, topk_weights, topk_ids, w1_scale, - g_idx1, rand_perm1, workspace, M, 2 * N, K, True, E, topk, - 
block_size_m, True, False) - - ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - - intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe( - intermediate_cache2, w2, sorted_token_ids, topk_weights, topk_ids, - w2_scale, g_idx2, rand_perm2, workspace, M, K, N, True, E, topk, - block_size_m, False, True) - - return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), - dim=1) - - def get_config_dtype_str(dtype: torch.dtype, use_int8_w8a16: Optional[bool] = False, use_fp8_w8a8: Optional[bool] = False): diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 3df0b61a9ebe4..f6c6f5f529408 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -306,10 +306,28 @@ def _load_single_value(self, param: torch.nn.Parameter, # Input scales can be loaded directly and should be equal. param_data[expert_id] = loaded_weight + def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor, + shard_dim: int, loaded_weight: torch.tensor, tp_rank: int): + + if shard_id == "w2": + self._load_w2(shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=tp_rank) + else: + assert shard_id in ("w1", "w3") + expert_data.copy_(loaded_weight) + def weight_loader(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, weight_name: str, shard_id: str, expert_id: int) -> None: + # compressed-tensors represents weights on disk which are flipped + loaded_weight = loaded_weight.t().contiguous() if ( + self.quant_method.__class__.__name__ + == "CompressedTensorsMoEMethod") else loaded_weight + if shard_id not in ("w1", "w2", "w3"): raise ValueError(f"shard_id must be ['w1','w2','w3'] but " f"got {shard_id}.") @@ -325,19 +343,41 @@ def weight_loader(self, param: torch.nn.Parameter, expert_data = param.data[expert_id] tp_rank = get_tensor_model_parallel_rank() - # is_transposed: whether or not the parameter is transposed on disk - # If transposed, the loaded weight will be transposed and the dim - # to shard the loaded weight will be flipped. + # is_transposed: if the dim to shard the weight + # should be flipped. Required by GPTQ, compressed-tensors + # should be whatever dimension intermediate_size is is_transposed = getattr(param, "is_transposed", False) shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id] if is_transposed: - loaded_weight = loaded_weight.t().contiguous() shard_dim = ~shard_dim - # Case weight_scales - if "weight_scale" in weight_name: - # load the weight scaling based on the quantization scheme - # supported weight scales can be found in + # Case input scale: input_scale loading is only supported for fp8 + if "input_scale" in weight_name: + if param.data[expert_id] != 1 and (param.data[expert_id] - + loaded_weight).abs() > 1e-5: + raise ValueError( + "input_scales of w1 and w3 of a layer " + f"must be equal. But got {param.data[expert_id]} " + f"vs. 
{loaded_weight}") + + self._load_single_value(param=param, + loaded_weight=loaded_weight, + expert_id=expert_id) + return + + # Case g_idx + if "g_idx" in weight_name: + self._load_g_idx(shard_dim=0, + shard_id=shard_id, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=tp_rank) + return + + # Case weight scales and zero_points + if ("scale" in weight_name or "zero" in weight_name): + # load the weight scales and zp based on the quantization scheme + # supported weight scales/zp can be found in # FusedMoeWeightScaleSupported # TODO @dsikka: once hardened, refactor to use vLLM Parameters # specific to each case @@ -366,22 +406,9 @@ def weight_loader(self, param: torch.nn.Parameter, f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}") return + # Case weight_shape if "weight_shape" in weight_name: - self._load_single_value(param=param, - loaded_weight=loaded_weight, - expert_id=expert_id) - return - - # Case input scale - if "input_scale" in weight_name: - # Note: input_scale loading is only supported for fp8 - if param.data[expert_id] != 1 and (param.data[expert_id] - - loaded_weight).abs() > 1e-5: - raise ValueError( - "input_scales of w1 and w3 of a layer " - f"must be equal. But got {param.data[expert_id]} " - f"vs. {loaded_weight}") - + # only required by compressed-tensors self._load_single_value(param=param, loaded_weight=loaded_weight, expert_id=expert_id) @@ -498,4 +525,4 @@ def _load_fp8_scale(self, param: torch.nn.Parameter, param_data[expert_id][idx] = loaded_weight # If we are in the row parallel case (down_proj) else: - param_data[expert_id] = loaded_weight \ No newline at end of file + param_data[expert_id] = loaded_weight diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 36323493d601e..49c29c2775cb6 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -5,9 +5,7 @@ import torch from vllm import _custom_ops as ops -from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - WNA16_SUPPORTED_BITS) +from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat) from vllm.model_executor.utils import set_weight_attrs @@ -40,11 +38,10 @@ def __init__( if not (self.quant_config.quant_format == CompressionFormat.pack_quantized.value - and self.num_bits in WNA16_SUPPORTED_BITS): + and self.num_bits == 4): raise ValueError("For Fused MoE layers, only ", f"{CompressionFormat.pack_quantized.value} ", - "is supported for the following bits: ", - f"{WNA16_SUPPORTED_BITS}") + "is supported for 4 bits") def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size: int, @@ -269,19 +266,30 @@ def apply( custom_routing_function: Optional[Callable] = None, ) -> torch.Tensor: - from vllm.model_executor.layers.fused_moe.fused_moe import ( + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( fused_marlin_moe) - return fused_marlin_moe(x, - layer.w13_weight_packed, - layer.w2_weight_packed, - router_logits, - layer.w13_g_idx, - layer.w2_g_idx, - layer.w13_g_idx_sort_indices, - layer.w2_g_idx_sort_indices, - top_k, - custom_routing_function, - 
renormalize=renormalize, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale) + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function) + + return fused_marlin_moe( + x, + layer.w13_weight_packed, + layer.w2_weight_packed, + router_logits, + layer.w13_g_idx, + layer.w2_g_idx, + layer.w13_g_idx_sort_indices, + layer.w2_g_idx_sort_indices, + topk_weights, + topk_ids, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 8897737c1c55a..3cade3d3fbcd0 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -22,7 +22,7 @@ __all__ = ["CompressedTensorsWNA16"] WNA16_SUPPORTED_TYPES_MAP = { 4: scalar_types.uint4b8, - 8: scalar_types.uint8b128, + 8: scalar_types.uint8b128 } WNA16_SUPPORTED_BITS = list(WNA16_SUPPORTED_TYPES_MAP.keys()) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index b06ff7bd2bace..3617a32f80fc1 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,18 +1,22 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional, Union import torch from torch.nn import Parameter from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.fused_moe.layer import ( + FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( apply_gptq_marlin_linear, check_marlin_supported, marlin_is_k_full, - marlin_make_empty_g_idx, marlin_make_workspace, marlin_permute_scales, - marlin_repeat_scales_on_all_ranks, marlin_sort_g_idx, replace_tensor, - verify_marlin_supported, verify_marlin_supports_shape) + marlin_make_empty_g_idx, marlin_make_workspace, marlin_moe_permute_scales, + marlin_permute_scales, marlin_repeat_scales_on_all_ranks, + marlin_sort_g_idx, replace_tensor, verify_marlin_supported, + verify_marlin_supports_shape) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.parameter import (ChannelQuantScaleParameter, GroupQuantScaleParameter, @@ -33,8 +37,14 @@ class GPTQMarlinConfig(QuantizationConfig): (8, True): scalar_types.uint8b128, } - def __init__(self, weight_bits: int, group_size: int, desc_act: bool, - is_sym: bool, lm_head_quantized: bool) -> None: + def __init__( + self, + weight_bits: int, + group_size: int, + desc_act: bool, + is_sym: bool, + lm_head_quantized: bool, + ) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) @@ 
-105,11 +115,14 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["GPTQMarlinLinearMethod"]: - if (isinstance(layer, LinearBase) or - (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)): + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]: + if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) + and self.lm_head_quantized): return GPTQMarlinLinearMethod(self) + elif isinstance(layer, FusedMoE): + return GPTQMarlinMoEMethod(self) return None def get_scaled_act_names(self) -> List[str]: @@ -179,7 +192,8 @@ def create_weights( output_size_per_partition=output_size_per_partition, input_size_per_partition=input_size_per_partition, input_size=input_size, - group_size=group_size) + group_size=group_size, + ) # Determine sharding if marlin_repeat_scales_on_all_ranks(self.quant_config.desc_act, @@ -299,7 +313,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: perm=layer.g_idx_sort_indices, size_k=layer.input_size_per_partition, size_n=layer.output_size_per_partition, - num_bits=self.quant_config.quant_type.size_bits) + num_bits=self.quant_config.quant_type.size_bits, + ) replace_tensor(layer, "qweight", marlin_qweight) # Permute scales from autogptq format to marlin format. @@ -308,7 +323,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: size_k=(layer.input_size if self.quant_config.desc_act else layer.input_size_per_partition), size_n=layer.output_size_per_partition, - group_size=self.quant_config.group_size) + group_size=self.quant_config.group_size, + ) replace_tensor(layer, "scales", marlin_scales) def apply( @@ -329,4 +345,270 @@ def apply( output_size_per_partition=layer.output_size_per_partition, input_size_per_partition=layer.input_size_per_partition, is_k_full=layer.is_k_full, - bias=bias) + bias=bias, + ) + + +class GPTQMarlinMoEMethod(FusedMoEMethodBase): + """MoE Marlin method with quantization.""" + + def __init__(self, quant_config: GPTQMarlinConfig) -> None: + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + # Currently assuming is_k_full is always True + # (input size per partition is the same as full input size) + # Supports only sym for now (no zp) + if self.quant_config.group_size != -1: + scales_size13 = hidden_size // self.quant_config.group_size + scales_size2 = intermediate_size // self.quant_config.group_size + strategy = FusedMoeWeightScaleSupported.GROUP.value + else: + scales_size13 = 1 + scales_size2 = 1 + strategy = FusedMoeWeightScaleSupported.CHANNEL.value + + extra_weight_attrs.update({ + "quant_method": strategy, + "is_transposed": True + }) + # Fused gate_up_proj (column parallel) + w13_qweight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size // self.quant_config.pack_factor, + 2 * intermediate_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_qweight", w13_qweight) + set_weight_attrs(w13_qweight, extra_weight_attrs) + # down_proj (row parallel) + w2_qweight = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size // self.quant_config.pack_factor, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + 
layer.register_parameter("w2_qweight", w2_qweight) + set_weight_attrs(w2_qweight, extra_weight_attrs) + # up_proj scales + w13_scales = torch.nn.Parameter( + torch.empty(num_experts, + scales_size13, + 2 * intermediate_size, + dtype=torch.half), + requires_grad=False, + ) + layer.register_parameter("w13_scales", w13_scales) + set_weight_attrs(w13_scales, extra_weight_attrs) + # down_proj scales + w2_scales = torch.nn.Parameter( + torch.empty(num_experts, + scales_size2, + hidden_size, + dtype=torch.half), + requires_grad=False, + ) + layer.register_parameter("w2_scales", w2_scales) + set_weight_attrs(w2_scales, extra_weight_attrs) + # up_proj scales + w13_qzeros = torch.nn.Parameter( + torch.empty(num_experts, + scales_size13, + 2 * intermediate_size // self.quant_config.pack_factor, + dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w13_qzeros", w13_qzeros) + set_weight_attrs(w13_qzeros, extra_weight_attrs) + # down_proj scales + w2_qzeros = torch.nn.Parameter( + torch.empty(num_experts, + scales_size2, + hidden_size // self.quant_config.pack_factor, + dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w2_qzeros", w2_qzeros) + set_weight_attrs(w2_qzeros, extra_weight_attrs) + w13_g_idx = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_g_idx", w13_g_idx) + set_weight_attrs(w13_g_idx, extra_weight_attrs) + w2_g_idx = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_g_idx", w2_g_idx) + set_weight_attrs(w2_g_idx, extra_weight_attrs) + w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs) + w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_g_idx_sort_indices", + w2_g_idx_sort_indices) + set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + + # Process act_order + if self.quant_config.desc_act: + # Get sorting based on g_idx + num_experts = layer.w13_g_idx.shape[0] + w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx) + w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx) + w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx) + w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx) + for e in range(num_experts): + w13_g_idx_sort_indices[e] = torch.argsort( + layer.w13_g_idx[e]).to(torch.int32) + w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_g_idx[e]).to( + torch.int32) + w13_sorted_g_idx[e] = layer.w13_g_idx[e][ + w13_g_idx_sort_indices[e]] + w2_sorted_g_idx[e] = layer.w2_g_idx[e][ + w2_g_idx_sort_indices[e]] + replace_tensor(layer, "w13_g_idx", w13_sorted_g_idx) + replace_tensor(layer, "w2_g_idx", w2_sorted_g_idx) + replace_tensor(layer, "w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + replace_tensor(layer, "w2_g_idx_sort_indices", + w2_g_idx_sort_indices) + else: + # Reset g_idx related tensors + num_experts = layer.w13_g_idx.shape[0] + device = layer.w13_g_idx.device + layer.w13_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + 
requires_grad=False, + ) + layer.w2_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + # Repack weights + marlin_w13_qweight = ops.gptq_marlin_moe_repack( + layer.w13_qweight, + layer.w13_g_idx_sort_indices, + layer.w13_qweight.shape[1] * self.quant_config.pack_factor, + layer.w13_qweight.shape[2], + self.quant_config.quant_type.size_bits, + ) + replace_tensor(layer, "w13_qweight", marlin_w13_qweight) + marlin_w2_qweight = ops.gptq_marlin_moe_repack( + layer.w2_qweight, + layer.w2_g_idx_sort_indices, + layer.w2_qweight.shape[1] * self.quant_config.pack_factor, + layer.w2_qweight.shape[2], + self.quant_config.quant_type.size_bits, + ) + replace_tensor(layer, "w2_qweight", marlin_w2_qweight) + # Repack scales + marlin_w13_scales = marlin_moe_permute_scales( + s=layer.w13_scales, + size_k=layer.intermediate_size_per_partition, + size_n=layer.w13_scales.shape[2], + group_size=self.quant_config.group_size, + ) + replace_tensor(layer, "w13_scales", marlin_w13_scales) + marlin_w2_scales = marlin_moe_permute_scales( + s=layer.w2_scales, + size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, + size_n=layer.w2_scales.shape[2], + group_size=self.quant_config.group_size, + ) + replace_tensor(layer, "w2_scales", marlin_w2_scales) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + ) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + fused_marlin_moe) + + # The input must currently be float16 + orig_dtype = x.dtype + x = x.half() + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=None) + + return fused_marlin_moe( + x, + layer.w13_qweight, + layer.w2_qweight, + router_logits, + layer.w13_g_idx, + layer.w2_g_idx, + layer.w13_g_idx_sort_indices, + layer.w2_g_idx_sort_indices, + topk_weights, + topk_ids, + w1_scale=layer.w13_scales, + w2_scale=layer.w2_scales, + ).to(orig_dtype) diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 0ec68ac5b0f21..699d5f1844146 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -176,6 +176,23 @@ def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int, return s +def marlin_moe_permute_scales( + s: torch.Tensor, + size_k: int, + size_n: int, + group_size: int, +): + num_experts = s.shape[0] + output = torch.empty( + (num_experts, s.shape[1], s.shape[2]), + device=s.device, + dtype=s.dtype, + ) + for e in range(num_experts): + output[e] = marlin_permute_scales(s[e], size_k, size_n, group_size) + return output + + def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int, num_bits: int) -> 
torch.Tensor: # Permute zero-points in a similar way to scales, but do not use the diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py index 7d08ac6f87469..4a06c5d63d52d 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py @@ -1,6 +1,6 @@ """Utility functions used for tests and benchmarks""" -from typing import List +from typing import List, Optional import numpy as np import torch @@ -92,8 +92,11 @@ def get_weight_perm(num_bits: int): return perm -def marlin_quantize(w: torch.Tensor, quant_type: ScalarType, group_size: int, - act_order: bool): +def marlin_quantize(w: torch.Tensor, + quant_type: ScalarType, + group_size: int, + act_order: bool, + test_perm: Optional[torch.Tensor] = None): size_k, size_n = w.shape num_bits = quant_type.size_bits @@ -104,7 +107,7 @@ def marlin_quantize(w: torch.Tensor, quant_type: ScalarType, group_size: int, # Quantize (and apply act_order if provided) w_ref, q_w, s, g_idx, rand_perm = gptq_quantize_weights( - w, quant_type, group_size, act_order) + w, quant_type, group_size, act_order, test_perm) # For act_order, sort the "weights" and "g_idx" so that group ids are # increasing diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 33f24ff5d54d3..bdfda31de852b 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -1,5 +1,5 @@ """This file is used for /tests and /benchmarks""" -from typing import List +from typing import List, Optional import numpy import torch @@ -53,7 +53,10 @@ def get_pack_factor(num_bits): return 32 // num_bits -def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int): +def permute_rows(q_w: torch.Tensor, + w_ref: torch.Tensor, + group_size: int, + test_perm: Optional[torch.Tensor] = None): assert q_w.shape == w_ref.shape orig_device = q_w.device @@ -64,7 +67,7 @@ def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int): g_idx[i] = i // group_size # Simulate act_order by doing a random permutation on K - rand_perm = torch.randperm(k_size) + rand_perm = test_perm if test_perm is not None else torch.randperm(k_size) g_idx = g_idx[rand_perm].contiguous() q_w = q_w[rand_perm, :].contiguous() @@ -164,8 +167,11 @@ def reshape_w(w): ) -def gptq_quantize_weights(w: torch.Tensor, quant_type: ScalarType, - group_size: int, act_order: bool): +def gptq_quantize_weights(w: torch.Tensor, + quant_type: ScalarType, + group_size: int, + act_order: bool, + test_perm: Optional[torch.Tensor] = None): size_k, _ = w.shape assert w.is_floating_point(), "w must be float" @@ -186,7 +192,8 @@ def gptq_quantize_weights(w: torch.Tensor, quant_type: ScalarType, ), "For act_order, groupsize = {} must be less than size_k = {}".format( group_size, size_k) - w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size) + w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size, + test_perm) return w_ref, w_q, w_s, g_idx, rand_perm diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 4bb943ab3afe4..0052489d99dc4 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -24,10 +24,18 @@ def get_model_architecture( # Special handling for quantized Mixtral. 
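[Editor's note] The `test_perm` parameter threaded through `marlin_quantize`, `gptq_quantize_weights`, and `permute_rows` above lets a test supply a fixed activation-order permutation instead of drawing a fresh `torch.randperm`, making act_order quantization reproducible. Below is a minimal usage sketch, not part of the patch; the tensor shapes, dtype, CPU execution, and the `from vllm.scalar_type import scalar_types` import path are assumptions for illustration.

```python
# Hedged sketch: demonstrate that passing the same test_perm yields the same
# g_idx across two quantization runs. Shapes/dtype/import paths are assumed.
import torch

from vllm.model_executor.layers.quantization.utils.quant_utils import (
    gptq_quantize_weights)
from vllm.scalar_type import scalar_types  # assumed import path

size_k, size_n, group_size = 256, 128, 64
w = torch.randn((size_k, size_n), dtype=torch.half)

# Fixed permutation: permute_rows reuses it instead of torch.randperm(k).
gen = torch.Generator().manual_seed(0)
perm = torch.randperm(size_k, generator=gen)

out1 = gptq_quantize_weights(w, scalar_types.uint4b8, group_size,
                             act_order=True, test_perm=perm)
out2 = gptq_quantize_weights(w, scalar_types.uint4b8, group_size,
                             act_order=True, test_perm=perm)

# Returns (w_ref, w_q, w_s, g_idx, rand_perm); g_idx is now deterministic.
assert torch.equal(out1[3], out2[3])
```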
# FIXME(woosuk): This is a temporary hack. mixtral_supported = ["fp8", "compressed-tensors"] + # for gptq_marlin, only run fused MoE for int4 + if model_config.quantization == "gptq_marlin": + hf_quant_config = getattr(model_config.hf_config, + "quantization_config", None) + if hf_quant_config and hf_quant_config.get("bits") == 4: + mixtral_supported.append("gptq_marlin") + if (model_config.quantization is not None and model_config.quantization not in mixtral_supported and "MixtralForCausalLM" in architectures): architectures = ["QuantMixtralForCausalLM"] + return ModelRegistry.resolve_model_cls(architectures) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index e744e36ac08bf..10cbfcf6432b3 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -435,7 +435,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): continue # Skip layers on other devices. if is_pp_missing_parameter(name, self): @@ -454,6 +455,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, @@ -464,7 +468,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): break else: # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): continue # Skip layers on other devices. if is_pp_missing_parameter(name, self): From a1d874224d9c29ae84f3850474b4816f0ed9574b Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Mon, 9 Sep 2024 23:21:00 -0700 Subject: [PATCH 6/7] Add NVIDIA Meetup slides, announce AMD meetup, and add contact info (#8319) --- README.md | 16 ++++++++++++---- docs/source/community/meetups.rst | 1 + 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9ae30f8d2de55..53749cb36b972 100644 --- a/README.md +++ b/README.md @@ -17,15 +17,16 @@ Easy, fast, and cheap LLM serving for everyone --- -**vLLM & NVIDIA Triton User Meetup (Monday, September 9, 5pm-9pm PT) at Fort Mason, San Francisco** +**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco** -We are excited to announce our sixth vLLM Meetup, in collaboration with NVIDIA Triton Team. -Join us to hear the vLLM's recent update about performance. -Register now [here](https://lu.ma/87q3nvnh) and be part of the event! +We are excited to announce our special vLLM event in collaboration with AMD and Anyscale. +Join us to learn more about recent advancements of vLLM on MI300X. +Register [here](https://lu.ma/db5ld9n5) and be a part of the event! --- *Latest News* 🔥 +- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! 
Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). - [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing). @@ -130,3 +131,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs year={2023} } ``` + +## Contact Us + +* For technical questions and feature requests, please use Github issues or discussions. +* For discussing with fellow users, please use Discord. +* For security disclosures, please use Github's security advisory feature. +* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. \ No newline at end of file diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst index 3b01b109ebf2c..a3962e96e7913 100644 --- a/docs/source/community/meetups.rst +++ b/docs/source/community/meetups.rst @@ -5,6 +5,7 @@ vLLM Meetups We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- `The sixth vLLM meetup `__, with NVIDIA, September 9th 2024. `[Slides] `__ - `The fifth vLLM meetup `__, with AWS, July 24th 2024. `[Slides] `__ - `The fourth vLLM meetup `__, with Cloudflare and BentoML, June 11th 2024. `[Slides] `__ - `The third vLLM meetup `__, with Roblox, April 2nd 2024. `[Slides] `__ From da1a844e61366b473cef6b3f7437ea5dc41876a1 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 10 Sep 2024 16:22:50 +0800 Subject: [PATCH 7/7] [Bugfix] Fix missing `post_layernorm` in CLIP (#8155) --- vllm/model_executor/models/clip.py | 29 +++++++++++++++++++++---- vllm/model_executor/models/siglip.py | 32 +++++++++++++++------------- 2 files changed, 42 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 70f1522ae2524..078928f281c26 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -355,6 +355,19 @@ def __init__(self, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override) + if len(self.encoder.layers) > config.num_hidden_layers: + raise ValueError( + f"The original encoder only has {config.num_hidden_layers} " + f"layers, but you requested {len(self.encoder.layers)} layers." 
+ ) + elif len(self.encoder.layers) == config.num_hidden_layers: + self.post_layernorm = nn.LayerNorm(embed_dim, + eps=config.layer_norm_eps) + else: + # post_layernorm is unused when we extract intermediate features + # In this case, we can skip it to conserve memory + self.post_layernorm = None + def forward( self, pixel_values: torch.Tensor, @@ -364,7 +377,10 @@ def forward( hidden_states = self.pre_layrnorm(hidden_states) hidden_states = self.encoder(inputs_embeds=hidden_states) - return hidden_states + if self.post_layernorm is None: + return hidden_states + + return self.post_layernorm(hidden_states) class CLIPVisionModel(nn.Module): @@ -386,9 +402,12 @@ def __init__(self, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override) - def forward(self, pixel_values: Optional[torch.Tensor] = None): + @property + def _require_post_layernorm(self) -> bool: + return self.vision_model.post_layernorm is not None - return self.vision_model(pixel_values=pixel_values) + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + return self.vision_model(pixel_values) @property def device(self): @@ -408,8 +427,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: # post_layernorm is not needed in CLIPVisionModel - if "vision_model.post_layernorm" in name: + if ("vision_model.post_layernorm" in name + and not self._require_post_layernorm): continue + # omit layers when num_hidden_layers_override is set if "vision_model.encoder.layers." in name: layer_idx = int(name.split(".")[3]) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 13d09e4cd4c23..f7976eba7420b 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -443,27 +443,26 @@ def __init__( self.config = config embed_dim = config.hidden_size - if (num_hidden_layers_override is None - or num_hidden_layers_override == config.num_hidden_layers): - self.need_post_layernorm = True - elif num_hidden_layers_override > config.num_hidden_layers: - raise ValueError( - "num_hidden_layers_override cannot be greater than " - "num_hidden_layers") - else: - self.need_post_layernorm = False - self.embeddings = SiglipVisionEmbeddings(config) self.encoder = SiglipEncoder( config, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, ) - if self.need_post_layernorm: + + if len(self.encoder.layers) > config.num_hidden_layers: + raise ValueError( + f"The original encoder only has {config.num_hidden_layers} " + f"layers, but you requested {len(self.encoder.layers)} layers." 
+ ) + elif len(self.encoder.layers) == config.num_hidden_layers: self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) else: - self.post_layernorm = nn.Identity() + # post_layernorm is unused when we extract intermediate features + # In this case, we can skip it to conserve memory + self.post_layernorm = None + self.use_head = (True if not hasattr(config, "vision_use_head") else config.vision_use_head) if self.use_head: @@ -482,6 +481,9 @@ def forward( encoder_outputs = self.encoder(inputs_embeds=hidden_states) + if self.post_layernorm is None: + return encoder_outputs + last_hidden_state = self.post_layernorm(encoder_outputs) # TODO: add this back when pooled_output is used in inference # if self.use_head: @@ -512,8 +514,8 @@ def __init__( ) @property - def need_post_layernorm(self): - return self.vision_model.need_post_layernorm + def _require_post_layernorm(self) -> bool: + return self.vision_model.post_layernorm is not None def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding @@ -541,7 +543,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: # post_layernorm is optional in SiglipVisionModel if ("vision_model.post_layernorm" in name - and not self.need_post_layernorm): + and not self._require_post_layernorm): continue # omit layers when num_hidden_layers_override is set
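[Editor's note] Both the CLIP and SigLIP changes in this patch follow the same pattern: `post_layernorm` is only materialized when the full stack of encoder layers is kept, and is set to `None` (and skipped in `load_weights`) when a truncated encoder is used to extract intermediate features. The sketch below is a simplified stand-in to illustrate that pattern; the class name, `nn.Linear` placeholder layers, and argument names are hypothetical, not the vLLM implementation.

```python
# Hedged sketch of the "optional post_layernorm" pattern from this patch.
from typing import Optional

import torch
import torch.nn as nn


class TruncatableVisionTower(nn.Module):

    def __init__(self,
                 hidden_size: int,
                 num_hidden_layers: int,
                 num_hidden_layers_override: Optional[int] = None,
                 layer_norm_eps: float = 1e-5) -> None:
        super().__init__()
        num_layers = num_hidden_layers_override or num_hidden_layers
        if num_layers > num_hidden_layers:
            raise ValueError(
                f"The original encoder only has {num_hidden_layers} layers, "
                f"but you requested {num_layers} layers.")
        # Placeholder "encoder layers" for illustration only.
        self.layers = nn.ModuleList(
            nn.Linear(hidden_size, hidden_size) for _ in range(num_layers))
        # Build post_layernorm only when the encoder is not truncated;
        # intermediate-feature extraction never uses it, so skipping it
        # conserves memory and lets weight loading drop the checkpoint entry.
        self.post_layernorm = (nn.LayerNorm(hidden_size, eps=layer_norm_eps)
                               if num_layers == num_hidden_layers else None)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        for layer in self.layers:
            hidden_states = layer(hidden_states)
        if self.post_layernorm is None:
            return hidden_states
        return self.post_layernorm(hidden_states)
```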